From 45333fa9d68d5bf8af7a811a45fff809d01cfe8b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 3 Jul 2024 09:43:20 -0400 Subject: [PATCH 01/39] basic benchmarks --- CMakeLists.txt | 19 +++++----- include/cufinufft/common.h | 4 ++ include/cufinufft/impl.h | 66 +++++++++++++++++++++------------ perftest/cuda/CMakeLists.txt | 1 + perftest/cuda/bench.sh | 13 +++++++ perftest/cuda/cuperftest.cu | 41 +++++++++++--------- src/cuda/1d/cufinufft1d.cu | 3 +- src/cuda/3d/spread3d_wrapper.cu | 16 +++++--- src/cuda/CMakeLists.txt | 17 ++++++++- src/cuda/common.cu | 28 ++++++++++++++ src/cuda/spreadinterp.cpp | 2 +- test/cuda/CMakeLists.txt | 8 ++++ 12 files changed, 156 insertions(+), 62 deletions(-) create mode 100644 perftest/cuda/bench.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index f53d6e28b..a6389f2ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.23) project(finufft VERSION 2.2.0 LANGUAGES C CXX) @@ -23,7 +23,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") - +set(FINUFFT_CUDA_ARCHITECTURES "all-major" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) @@ -219,30 +219,29 @@ if (FINUFFT_USE_CUDA) if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES) message("FINUFFT WARNING: No CUDA architecture supplied via '-DCMAKE_CUDA_ARCHITECTURES=...', defaulting to '60;70;75;'") message("See: https://developer.nvidia.com/cuda-gpus for more details on what architecture to supply.") - set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) endif () enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) + if (BUILD_TESTING OR FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) + add_subdirectory(test/cuda) endif () - list(APPEND INSTALL_TARGETS cufinufft cufinufft_static) endif () # Add tests defined in their own directory -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS)) add_subdirectory(test) add_subdirectory(perftest) endif () -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA) - add_subdirectory(test/cuda) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU) + add_subdirectory(examples) endif () -if (FINUFFT_BUILD_EXAMPLES) - add_subdirectory(examples) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_GPU) + add_subdirectory(examples/cuda) endif () if (FINUFFT_BUILD_FORTRAN) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 7bddc188e..b45519a50 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -32,6 +32,10 @@ template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex *a, T *fwkerhalf, finufft_spread_opts opts); +template +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z); + } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 826319516..aa58c8dee 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -53,6 +53,7 @@ static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { } break; case 3: { switch (opts->gpu_method) { + case 0: case 1: case 2: { opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; @@ -109,17 +110,16 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } // Mult-GPU support: set the CUDA Device ID: - const int device_id = opts == NULL ? 0 : opts->gpu_device_id; + const int device_id = opts == nullptr ? 0 : opts->gpu_device_id; cufinufft::utils::WithCudaDevice device_swapper(device_id); /* allocate the plan structure, assign address to user pointer. */ - cufinufft_plan_t *d_plan = new cufinufft_plan_t; - *d_plan_ptr = d_plan; + auto *d_plan = new cufinufft_plan_t; + *d_plan_ptr = d_plan; // Zero out your struct, (sets all pointers to NULL) memset(d_plan, 0, sizeof(*d_plan)); - /* If a user has not supplied their own options, assign defaults for them. */ - if (opts == NULL) { // use default opts + if (opts == nullptr) { // use default opts cufinufft_default_opts(&(d_plan->opts)); } else { // or read from what's passed in d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect @@ -138,26 +138,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran } auto &stream = d_plan->stream = (cudaStream_t)d_plan->opts.gpu_stream; - - /* Automatically set GPU method. */ - if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. - * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. - * - * For type 2, we always default to method 1 (GM). */ - if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) - d_plan->opts.gpu_method = 2; - else if (type == 1 && tol < 1e-3) - d_plan->opts.gpu_method = 1; - else if (type == 2) - d_plan->opts.gpu_method = 1; - } - - /* Setup Spreader */ using namespace cufinufft::common; + /* Setup Spreader */ + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { delete *d_plan_ptr; @@ -180,6 +163,41 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran if (dim > 2) set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3, d_plan->opts.gpu_obinsizez); + + // dynamically request the maximum amount of shared memory available + // for the spreader + + /* Automatically set GPU method. */ + if (d_plan->opts.gpu_method == 0) { + /* For type 1, we default to method 2 (SM) since this is generally faster. + * However, in the special case of _double precision_ in _three dimensions_ + * with more than _three digits of precision_, there is note enough shared + * memory for this to work. As a result, we will default to method 1 (GM) in + * this special case. + * + * For type 2, we always default to method 1 (GM). */ + + // query the device for the amount of shared memory available + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = + shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block, + shared_mem_required); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + printf("choosing method 1\n"); + } else { + d_plan->opts.gpu_method = 2; + printf("choosing method 2\n"); + } + printf("using method %d\n", d_plan->opts.gpu_method); + } + int fftsign = (iflag >= 0) ? 1 : -1; d_plan->nf1 = nf1; diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 9d817d5f6..5f1079fde 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,3 +1,4 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) +#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) \ No newline at end of file diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh new file mode 100644 index 000000000..9832e1088 --- /dev/null +++ b/perftest/cuda/bench.sh @@ -0,0 +1,13 @@ +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10 diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index f72ffb3e6..85118f1f8 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -275,24 +275,29 @@ template void run_test(test_options_t &test_opts) { } const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - - printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - h2d_timer.mean(), h2d_timer.std()); - printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), - makeplan_timer.mean(), makeplan_timer.std()); - printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), - setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), - setpts_timer.tot() * 1E6 / nupts_tot); - printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), - execute_timer.mean(), execute_timer.std(), - nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); - printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - d2h_timer.mean(), d2h_timer.std()); - printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), - amortized_timer.mean(), amortized_timer.std(), - nupts_tot * 1000 / amortized_timer.tot(), - amortized_timer.tot() * 1E6 / nupts_tot); + // + // printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + // printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + // h2d_timer.mean(), h2d_timer.std()); + // printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), + // makeplan_timer.tot(), + // makeplan_timer.mean(), makeplan_timer.std()); + // printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + // setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / + // setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); + // printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + // execute_timer.mean(), execute_timer.std(), + // nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / + // nupts_tot); + // printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + // d2h_timer.mean(), d2h_timer.std()); + // printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + // amortized_timer.mean(), amortized_timer.std(), + // nupts_tot * 1000 / amortized_timer.tot(), + // amortized_timer.tot() * 1E6 / nupts_tot); + // print numpts / s + printf("setpts pts/s: %g\n", float(nupts_tot) * 1000 / setpts_timer.tot()); + printf("execute pts/s: %g\n", float(nupts_tot) * 1000 / execute_timer.tot()); } int main(int argc, char *argv[]) { diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 26eaff491..4ecb3b283 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -5,11 +5,10 @@ #include #include -#include +#include #include #include -#include #include #include diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index fa67f95f8..c25393e1a 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -536,14 +536,17 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" - << sharedplanorysize << ")" << std::endl; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + // if (sharedplanorysize > 49152) { + // std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" + // << sharedplanorysize << ")" << std::endl; + // return FINUFFT_ERR_INSUFFICIENT_SHMEM; + // } for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { + cudaFuncSetAttribute(spread_3d_subprob, + cudaFuncAttributeMaxDynamicSharedMemorySize, + sharedplanorysize); spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -551,6 +554,9 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + cudaFuncSetAttribute(spread_3d_subprob, + cudaFuncAttributeMaxDynamicSharedMemorySize, + sharedplanorysize); spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 62d6c901c..d2928858b 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -24,26 +24,38 @@ set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_common_objects PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_objects PROPERTIES + POSITION_INDEPENDENT_CODE ON + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) add_library(cufinufft SHARED $ $ ) +target_include_directories(cufinufft PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cufinufft CUDA::cudart CUDA::cufft CUDA::nvToolsExt) set_target_properties( cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) add_library(cufinufft_static STATIC $ $ ) +target_include_directories(cufinufft_static PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) if(WIN32) target_link_libraries(cufinufft_static PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) else() @@ -51,7 +63,8 @@ else() endif() set_target_properties( cufinufft_static PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" ) file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") diff --git a/src/cuda/common.cu b/src/cuda/common.cu index c6bf8315d..7709cdf74 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -199,6 +199,28 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex +std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, + int bin_size_z) { + printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, + bin_size_x, bin_size_y, bin_size_z); + int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; + + if (dim == 1) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_y + ((ns + 1) / 2) * 2); + + if (dim == 2) { + return adjusted_ns * sizeof(cuda_complex); + } + + adjusted_ns *= (bin_size_z + ((ns + 1) / 2) * 2); + + return adjusted_ns * sizeof(cuda_complex); +} + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, finufft_spread_opts opts); @@ -227,5 +249,11 @@ template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, float *fwkerhalf, finufft_spread_opts opts); template void onedim_fseries_kernel(CUFINUFFT_BIGINT nf, double *fwkerhalf, finufft_spread_opts opts); + +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); +template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, + int bin_size_y, int bin_size_z); + } // namespace common } // namespace cufinufft diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index 6ff91f8ca..b01d1c98f 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -69,7 +69,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; - opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) + opts.ES_halfwidth = T(ns * .5); // constants to help ker eval (except Horner) opts.ES_c = 4.0 / (T)(ns * ns); T betaoverns = 2.30; // gives decent betas for default sigma=2.0 diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 23b3346da..8d77d9fdc 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,6 +7,14 @@ foreach(srcfile ${test_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) + set_target_properties(${executable} PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + ) + message(STATUS "Adding test ${executable}" + " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" + " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}" + ) endforeach() function(add_tests PREC REQ_TOL CHECK_TOL) From b95a0826a6adfcbc1c81cd46576b3006633124b4 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 3 Jul 2024 22:13:31 -0400 Subject: [PATCH 02/39] added plotting script --- CMakeLists.txt | 2 +- perftest/cuda/bench.py | 106 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+), 1 deletion(-) create mode 100644 perftest/cuda/bench.py diff --git a/CMakeLists.txt b/CMakeLists.txt index a6389f2ec..15e6161a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") -set(FINUFFT_CUDA_ARCHITECTURES "all-major" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") +set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py new file mode 100644 index 000000000..8812f10a4 --- /dev/null +++ b/perftest/cuda/bench.py @@ -0,0 +1,106 @@ +import matplotlib.pyplot as plt +import os +import subprocess +import pandas as pd +import numpy as np + +cwd = os.getcwd() + + +# function that runs a command line command and returns the output +# it also takes a list of arguments to pass to the command +def run_command(command, args): + # convert command and args to a string + try: + cmd = [command] + args + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout + except subprocess.CalledProcessError as e: + print('stdout output:\n', e.stdout) + print('stderr output:\n', e.stderr) + print("Error executing command:", e) + + +# function that builds a string from a dictionary of arguments + +def build_args(args): + args_list = [] + for key, value in args.items(): + args_list.append(key + " " + value) + return ' '.join(args_list) + + +# function + +# example command to run: +# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 +# example arguments +args = {"--prec": "f", + "--n_runs": "1", + "--method": "1", + "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--M": "1E8", + "--tol": "1E-6"} +# iterate over tol from 1E-6 to 1E-1 +data = { + 'method': [], + 'throughput': [], + 'tolerance': [] +} +for i in range(1, 7): + args["--tol"] = "1E-" + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] + run_command("nsys", cmd) + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + csv = run_command("nsys", cmd) + print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_kern_sum.csv") + # sort dt by column "Time (%)" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + dt = dt.sort_values(by="Time (%)", ascending=False) + # drop all the rows with spread not in "Name" + time = dt["Avg (ns)"].values[0] + # pt/s + throughput = float(args['--M']) * 1_000_000_000 / time + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + +df = pd.DataFrame(data) + +# Pivot the DataFrame +pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') +# Plot +pivot_df.plot(kind='bar', figsize=(10, 7)) +# Find the minimum throughput value +min_throughput = df['throughput'].min() + +# Calculate the smallest power of 10 +min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) + +# Adjust the plot's y-axis limits +plt.ylim(df['throughput'].min()*.95, df['throughput'].max() * 1.05) # Adding 10% for upper margin + +plt.xlabel('Tolerance') +plt.ylabel('Throughput') +plt.title('Throughput by Tolerance and Method') +plt.legend(title='Method') +plt.tight_layout() +plt.show() +plt.xlabel("Tolerance") +plt.ylabel("Points/s") +plt.savefig("bench.png") +plt.savefig("bench.svg") +plt.savefig("bench.pdf") +plt.show() \ No newline at end of file From ae55ca5b96b7167831ccdc0a2e2211b6297753bf Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 12:09:20 -0400 Subject: [PATCH 03/39] optimised plotting --- perftest/cuda/bench.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8812f10a4..5857b5ede 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -65,20 +65,27 @@ def build_args(args): cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] csv = run_command("nsys", cmd) - print(csv) - dt = pd.read_csv("./cuperftest_cuda_gpu_kern_sum.csv") - # sort dt by column "Time (%)" - dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] - dt = dt.sort_values(by="Time (%)", ascending=False) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') # drop all the rows with spread not in "Name" - time = dt["Avg (ns)"].values[0] + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') # pt/s - throughput = float(args['--M']) * 1_000_000_000 / time + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') data['throughput'].append(throughput) data['tolerance'].append(args['--tol']) df = pd.DataFrame(data) - +print(df) # Pivot the DataFrame pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') # Plot @@ -90,7 +97,7 @@ def build_args(args): min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) # Adjust the plot's y-axis limits -plt.ylim(df['throughput'].min()*.95, df['throughput'].max() * 1.05) # Adding 10% for upper margin +plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.09) # Adding 10% for upper margin plt.xlabel('Tolerance') plt.ylabel('Throughput') From 16e27f0575a930633803c13ea274fd8182c4a064 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 12:28:34 -0400 Subject: [PATCH 04/39] fixed plotting and metrics --- perftest/cuda/bench.py | 15 +++++++++++--- perftest/cuda/cuperftest.cu | 41 ++++++++++++++++--------------------- 2 files changed, 30 insertions(+), 26 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 5857b5ede..88ef0679b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -3,7 +3,7 @@ import subprocess import pandas as pd import numpy as np - +import io cwd = os.getcwd() @@ -61,10 +61,19 @@ def build_args(args): data['method'].append('SM') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] - run_command("nsys", cmd) + stdout = run_command("nsys", cmd) + # skip all lines starting with # in stdout + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] - csv = run_command("nsys", cmd) + stdout = run_command("nsys", cmd) # print(csv) dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") # print(dt) diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index 85118f1f8..f72ffb3e6 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -275,29 +275,24 @@ template void run_test(test_options_t &test_opts) { } const int64_t nupts_tot = M * test_opts.n_runs * ntransf; - // - // printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); - // printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), - // h2d_timer.mean(), h2d_timer.std()); - // printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), - // makeplan_timer.tot(), - // makeplan_timer.mean(), makeplan_timer.std()); - // printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), - // setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / - // setpts_timer.tot(), setpts_timer.tot() * 1E6 / nupts_tot); - // printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), - // execute_timer.mean(), execute_timer.std(), - // nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / - // nupts_tot); - // printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), - // d2h_timer.mean(), d2h_timer.std()); - // printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), - // amortized_timer.mean(), amortized_timer.std(), - // nupts_tot * 1000 / amortized_timer.tot(), - // amortized_timer.tot() * 1E6 / nupts_tot); - // print numpts / s - printf("setpts pts/s: %g\n", float(nupts_tot) * 1000 / setpts_timer.tot()); - printf("execute pts/s: %g\n", float(nupts_tot) * 1000 / execute_timer.tot()); + + printf("event,count,tot(ms),mean(ms),std(ms),nupts/s,ns/nupt\n"); + printf("host_to_device,%d,%f,%f,%f,0.0,0.0\n", h2d_timer.count(), h2d_timer.tot(), + h2d_timer.mean(), h2d_timer.std()); + printf("makeplan,%d,%f,%f,%f,0.0,0.0\n", makeplan_timer.count(), makeplan_timer.tot(), + makeplan_timer.mean(), makeplan_timer.std()); + printf("setpts,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, setpts_timer.tot(), + setpts_timer.mean(), setpts_timer.std(), nupts_tot * 1000 / setpts_timer.tot(), + setpts_timer.tot() * 1E6 / nupts_tot); + printf("execute,%d,%f,%f,%f,%g,%f\n", test_opts.n_runs, execute_timer.tot(), + execute_timer.mean(), execute_timer.std(), + nupts_tot * 1000 / execute_timer.tot(), execute_timer.tot() * 1E6 / nupts_tot); + printf("device_to_host,%d,%f,%f,%f,0.0,0.0\n", d2h_timer.count(), d2h_timer.tot(), + d2h_timer.mean(), d2h_timer.std()); + printf("amortized,%d,%f,%f,%f,%g,%f\n", 1, amortized_timer.tot(), + amortized_timer.mean(), amortized_timer.std(), + nupts_tot * 1000 / amortized_timer.tot(), + amortized_timer.tot() * 1E6 / nupts_tot); } int main(int argc, char *argv[]) { From 49d1f21c095704277932b3f3c204ab0f70fc58f3 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 15:28:32 -0400 Subject: [PATCH 05/39] fixed the plot script --- include/cufinufft/impl.h | 2 +- perftest/cuda/bench.py | 53 ++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index aa58c8dee..a53f58c82 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -42,7 +42,7 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { switch (dim) { case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16384 : opts->gpu_binsizex; opts->gpu_binsizey = 1; opts->gpu_binsizez = 1; } break; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 88ef0679b..def6e8303 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -13,6 +13,7 @@ def run_command(command, args): # convert command and args to a string try: cmd = [command] + args + print("Running command:", ' '.join(cmd)) result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) return result.stdout except subprocess.CalledProcessError as e: @@ -26,8 +27,9 @@ def run_command(command, args): def build_args(args): args_list = [] for key, value in args.items(): - args_list.append(key + " " + value) - return ' '.join(args_list) + args_list.append(key) + args_list.append(value) + return args_list # function @@ -36,9 +38,9 @@ def build_args(args): # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments args = {"--prec": "f", - "--n_runs": "1", + "--n_runs": "5", "--method": "1", - "--N1": "256", + "--N1": "65536", # "--N2": "256", # "--N3": "256", "--M": "1E8", @@ -47,7 +49,9 @@ def build_args(args): data = { 'method': [], 'throughput': [], - 'tolerance': [] + 'tolerance': [], + # 'setpts': [], + 'exec': [], } for i in range(1, 7): args["--tol"] = "1E-" + str(i) @@ -60,15 +64,17 @@ def build_args(args): elif method == '2': data['method'].append('SM') print("Method " + data['method'][-1]) - cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest", build_args(args)] + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) stdout = run_command("nsys", cmd) # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') - setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() - exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value print(f'setpts pts/s: {setpts}') print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", @@ -84,6 +90,7 @@ def build_args(args): # drop all the rows with spread not in "Name" dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] # print(dt) + # exit(0) # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft print(f'total_spread: {total_spread}') @@ -92,30 +99,46 @@ def build_args(args): print(f'throughput: {throughput}') data['throughput'].append(throughput) data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) + df = pd.DataFrame(data) -print(df) # Pivot the DataFrame -pivot_df = df.pivot(index='tolerance', columns='method', values='throughput') +pivot_df = df.pivot(index='tolerance', columns='method') +# print(pivot_df) +# scale the throughput SM by GM +pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] +# scale setpts SM by GM +pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] +# remove the GM column +pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) +pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_throughput = df['throughput'].min() +min_val = min(df['throughput'].min(), df['exec'].min()) +max_val = max(df['throughput'].max(), df['exec'].max()) +plt.ylim(.8, 1.2) # Calculate the smallest power of 10 -min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) +# min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) # Adjust the plot's y-axis limits -plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.09) # Adding 10% for upper margin +# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin +# plot an horizontal line at 1 with label "GM" +plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput') +plt.ylabel('Throughput (% of GM)') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s") +plt.ylabel("Points/s (% of GM)") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") From 2fdae684b2a6044f1d5bca9302666779b6272fd5 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 19:19:48 -0400 Subject: [PATCH 06/39] bin_size_x is as function of the shared memory available --- include/cufinufft/common.h | 28 ++++++++++++++ include/cufinufft/impl.h | 36 +----------------- perftest/cuda/bench.py | 47 ++++++++++++++++++----- src/cuda/1d/spread1d_wrapper.cu | 14 ++++--- src/cuda/3d/spread3d_wrapper.cu | 18 +++------ src/cuda/common.cu | 67 ++++++++++++++++++++++++++++++++- 6 files changed, 146 insertions(+), 64 deletions(-) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index b45519a50..33d8a0d86 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -36,6 +36,34 @@ template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); +template +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts); + +template +auto cufinufft_set_shared_memory(V *kernel, const int dim, + const cufinufft_plan_t &d_plan) { + int device_id; + cudaGetDevice(&device_id); + const auto shared_mem_required = + shared_memory_required(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, + d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez); + int shared_mem_per_block{}; + const auto err = cudaDeviceGetAttribute( + &shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (err != cudaSuccess) { + return err; + } + if (shared_mem_required > shared_mem_per_block) { + fprintf(stderr, + "Error: Shared memory required per block is %zu bytes, but the device " + "supports only %d bytes.\n", + shared_mem_required, shared_mem_per_block); + return err; + } + return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem_required); +} + } // namespace common } // namespace cufinufft #endif diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index a53f58c82..4a1c6ae31 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -39,40 +39,6 @@ template int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan); -static void cufinufft_setup_binsize(int type, int dim, cufinufft_opts *opts) { - switch (dim) { - case 1: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16384 : opts->gpu_binsizex; - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; - opts->gpu_binsizez = 1; - } break; - case 3: { - switch (opts->gpu_method) { - case 0: - case 1: - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; - } break; - case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; - } break; - } - } break; - } -} - template int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntransf, T tol, cufinufft_plan_t **d_plan_ptr, cufinufft_opts *opts) { @@ -153,7 +119,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mt = nmodes[1]; d_plan->mu = nmodes[2]; - cufinufft_setup_binsize(type, dim, &d_plan->opts); + cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index def6e8303..1e1f4838e 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -15,7 +15,7 @@ def run_command(command, args): cmd = [command] + args print("Running command:", ' '.join(cmd)) result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) - return result.stdout + return result.stdout, result.stderr except subprocess.CalledProcessError as e: print('stdout output:\n', e.stdout) print('stderr output:\n', e.stderr) @@ -38,9 +38,9 @@ def build_args(args): # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments args = {"--prec": "f", - "--n_runs": "5", - "--method": "1", - "--N1": "65536", + "--n_runs": "10", + "--method": "0", + "--N1": "16777216", # "--N2": "256", # "--N3": "256", "--M": "1E8", @@ -53,10 +53,26 @@ def build_args(args): # 'setpts': [], 'exec': [], } +warmup = {"--prec": "f", + "--n_runs": "1", + "--method": "0", + "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--M": "256", + "--tol": "1E-1"} +cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup) +print("Warmup") +stdout, stderr = run_command("nsys", cmd) +print("Benchmarking") +if stderr != '': + print(stderr) + exit(0) for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) for method in ['2', '1']: + args["--method"] = method if method == '0': data['method'].append('auto') elif method == '1': @@ -65,7 +81,10 @@ def build_args(args): data['method'].append('SM') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) - stdout = run_command("nsys", cmd) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) # skip all lines starting with # in stdout conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) @@ -79,7 +98,10 @@ def build_args(args): print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] - stdout = run_command("nsys", cmd) + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) # print(csv) dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") # print(dt) @@ -94,6 +116,9 @@ def build_args(args): # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) # pt/s throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread print(f'throughput: {throughput}') @@ -116,12 +141,16 @@ def build_args(args): # remove the GM column pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) + +print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(df['throughput'].min(), df['exec'].min()) -max_val = max(df['throughput'].max(), df['exec'].max()) -plt.ylim(.8, 1.2) +min_val = min(pivot_df[('exec', 'SM')].min(), pivot_df[('throughput', 'SM')].min(), 1) +max_val = max(pivot_df[('exec', 'SM')].max(), pivot_df[('throughput', 'SM')].max(), 0) +print(min_val, max_val) +plt.ylim(min_val * .99, max_val * 1.01) +# plt.ylim(.8, 1.2) # Calculate the smallest power of 10 # min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 26fd5024c..36fa2bef9 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -251,15 +252,14 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = - (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(1, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, @@ -268,6 +268,8 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index c25393e1a..6c851389c 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -7,9 +7,11 @@ #include #include +#include #include #include #include + using namespace cufinufft::common; using namespace cufinufft::memtransfer; @@ -536,17 +538,10 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - // if (sharedplanorysize > 49152) { - // std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" - // << sharedplanorysize << ")" << std::endl; - // return FINUFFT_ERR_INSUFFICIENT_SHMEM; - // } - for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { - cudaFuncSetAttribute(spread_3d_subprob, - cudaFuncAttributeMaxDynamicSharedMemorySize, - sharedplanorysize); + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -554,9 +549,8 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { - cudaFuncSetAttribute(spread_3d_subprob, - cudaFuncAttributeMaxDynamicSharedMemorySize, - sharedplanorysize); + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 7709cdf74..5e32cb101 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -202,8 +202,8 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z) { - printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, - bin_size_x, bin_size_y, bin_size_z); + // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, + // bin_size_x, bin_size_y, bin_size_z); int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; if (dim == 1) { @@ -221,6 +221,65 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +template +void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { + int shared_mem_per_block{}, device_id{}; + switch (dim) { + case 1: { + switch (opts->gpu_method) { + case 0: + case 1: + case 2: + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + const int exponent = std::log2(bin_size); + opts->gpu_binsizex = 1 << (exponent - 1); + // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, + // opts->gpu_binsizex); + } + break; + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + opts->gpu_binsizez = 1; + } break; + case 3: { + switch (opts->gpu_method) { + case 0: + case 1: + case 2: { + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; + } break; + case 4: { + opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; + } break; + } + } break; + } +} + template void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, float *f, std::complex *a, float *fwkerhalf, finufft_spread_opts opts); @@ -255,5 +314,9 @@ template std::size_t shared_memory_required(int dim, int ns, int bin_size template std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z); +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); +template void cufinufft_setup_binsize(int type, int ns, int dim, + cufinufft_opts *opts); } // namespace common } // namespace cufinufft From c0d992377dc66808d53b0bc5ebe9f8aae3f33fa4 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 8 Jul 2024 19:22:17 -0400 Subject: [PATCH 07/39] bin_size_x is as function of the shared memory available --- perftest/cuda/bench.py | 1 + 1 file changed, 1 insertion(+) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 1e1f4838e..5269a3f45 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -56,6 +56,7 @@ def build_args(args): warmup = {"--prec": "f", "--n_runs": "1", "--method": "0", + "--sort": "0", "--N1": "256", # "--N2": "256", # "--N3": "256", From 907797c82fe6ce839385348644f77d11cd5b4a34 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 9 Jul 2024 14:19:23 -0400 Subject: [PATCH 08/39] minor optimizations in 1D --- .../contrib/ker_horner_allw_loop.inc | 362 +++++++++--------- include/cufinufft/spreadinterp.h | 1 + perftest/cuda/bench.py | 13 +- src/cuda/1d/spreadinterp1d.cuh | 285 +++++++------- src/cuda/common.cu | 4 +- 5 files changed, 337 insertions(+), 328 deletions(-) diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 32f2cff00..f905c14f0 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -2,215 +2,215 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) 2018, The Simons Foundation, Inc. if (w==2) { - CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; - CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; - CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; - CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; - CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; - CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; + constexpr CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; + constexpr CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; + constexpr CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; + constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; + constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; + constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==3) { - CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; - CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; - CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; - CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; - CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; - CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; - CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; + constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; + constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; + constexpr CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; + constexpr CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; + constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; + constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; + constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==4) { - CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; - CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; - CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; - CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; - CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; - CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; - CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; - CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; + constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; + constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; + constexpr CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; + constexpr CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; + constexpr CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; + constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; + constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; + constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==5) { - CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; - CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; - CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; - CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; - CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; - CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; - CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; - CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; - CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; + constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; + constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; + constexpr CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; + constexpr CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; + constexpr CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; + constexpr CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; + constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; + constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; + constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==6) { - CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; - CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; - CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; - CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; - CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; - CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; - CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; - CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; - CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; - CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; + constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; + constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; + constexpr CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; + constexpr CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; + constexpr CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; + constexpr CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; + constexpr CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; + constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; + constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; + constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==7) { - CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; - CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; - CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; - CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; - CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; - CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; - CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; - CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; - CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; - CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; - CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; + constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; + constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; + constexpr CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; + constexpr CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; + constexpr CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; + constexpr CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; + constexpr CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; + constexpr CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; + constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; + constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; + constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==8) { - CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; - CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; - CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; - CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; - CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; - CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; - CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; - CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; - CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; - CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; - CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; + constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; + constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; + constexpr CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; + constexpr CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; + constexpr CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; + constexpr CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; + constexpr CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; + constexpr CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; + constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; + constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; + constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==9) { - CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; - CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; - CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; - CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; - CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; - CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; - CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; - CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; - CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; - CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; - CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; - CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; + constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; + constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; + constexpr CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; + constexpr CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; + constexpr CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; + constexpr CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; + constexpr CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; + constexpr CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; + constexpr CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; + constexpr CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; + constexpr CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; + constexpr CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==10) { - CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; - CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; - CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; - CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; - CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; - CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; - CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; - CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; - CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; - CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; - CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; - CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; - CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; + constexpr CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; + constexpr CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; + constexpr CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; + constexpr CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; + constexpr CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; + constexpr CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; + constexpr CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; + constexpr CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; + constexpr CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; + constexpr CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; + constexpr CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; + constexpr CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; + constexpr CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==11) { - CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; - CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; - CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; - CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; - CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; - CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; - CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; - CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; - CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; - CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; - CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; - CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; - CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; - CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; + constexpr CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; + constexpr CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; + constexpr CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; + constexpr CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; + constexpr CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; + constexpr CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; + constexpr CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; + constexpr CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; + constexpr CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; + constexpr CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; + constexpr CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; + constexpr CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; + constexpr CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; + constexpr CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==12) { - CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; - CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; - CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; - CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; - CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; - CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; - CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; - CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; - CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; - CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; - CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; - CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; - CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; - CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; + constexpr CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; + constexpr CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; + constexpr CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; + constexpr CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; + constexpr CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; + constexpr CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; + constexpr CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; + constexpr CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; + constexpr CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; + constexpr CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; + constexpr CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; + constexpr CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; + constexpr CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; + constexpr CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; - CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; - CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; - CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; - CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; - CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; - CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; - CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; - CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; - CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; - CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; - CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; - CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; - CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; - CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; + constexpr CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; + constexpr CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; + constexpr CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; + constexpr CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; + constexpr CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; + constexpr CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; + constexpr CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; + constexpr CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; + constexpr CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; + constexpr CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; + constexpr CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; + constexpr CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; + constexpr CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; + constexpr CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; + constexpr CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; - CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; - CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; - CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; - CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; - CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; - CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; - CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; - CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; - CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; - CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; - CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; - CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; - CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; - CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; + constexpr CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; + constexpr CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; + constexpr CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; + constexpr CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; + constexpr CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; + constexpr CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; + constexpr CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; + constexpr CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; + constexpr CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; + constexpr CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; + constexpr CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; + constexpr CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; + constexpr CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; + constexpr CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; + constexpr CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; - CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; - CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; - CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; - CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; - CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; - CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; - CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; - CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; - CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; - CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; - CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; - CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; - CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; - CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; - CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; - CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; + constexpr CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; + constexpr CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; + constexpr CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; + constexpr CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; + constexpr CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; + constexpr CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; + constexpr CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; + constexpr CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; + constexpr CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; + constexpr CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; + constexpr CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; + constexpr CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; + constexpr CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; + constexpr CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; + constexpr CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; + constexpr CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; + constexpr CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; - CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; - CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; - CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; - CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; - CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; - CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; - CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; - CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; - CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; - CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; - CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; - CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; - CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; - CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; - CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; - CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; - CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; + constexpr CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; + constexpr CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; + constexpr CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; + constexpr CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; + constexpr CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; + constexpr CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; + constexpr CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; + constexpr CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; + constexpr CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; + constexpr CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; + constexpr CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; + constexpr CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; + constexpr CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; + constexpr CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; + constexpr CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; + constexpr CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; + constexpr CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; + constexpr CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index da1c59930..d2f1ecd2d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -2,6 +2,7 @@ #define __CUSPREADINTERP_H__ #include +#include #include #include diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 5269a3f45..bb288af0b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -40,8 +40,10 @@ def build_args(args): args = {"--prec": "f", "--n_runs": "10", "--method": "0", - "--N1": "16777216", - # "--N2": "256", + "--sort": "1", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--M": "1E8", "--tol": "1E-6"} @@ -56,7 +58,6 @@ def build_args(args): warmup = {"--prec": "f", "--n_runs": "1", "--method": "0", - "--sort": "0", "--N1": "256", # "--N2": "256", # "--N3": "256", @@ -142,13 +143,13 @@ def build_args(args): # remove the GM column pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) - +pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(pivot_df[('exec', 'SM')].min(), pivot_df[('throughput', 'SM')].min(), 1) -max_val = max(pivot_df[('exec', 'SM')].max(), pivot_df[('throughput', 'SM')].max(), 0) +min_val = min(pivot_df[('throughput', 'SM')].min(), 1) +max_val = max(pivot_df[('throughput', 'SM')].max(), 0) print(min_val, max_val) plt.ylim(min_val * .99, max_val * 1.01) # plt.ylim(.8, 1.2) diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 24b4fb9d2..9a536ec9c 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -5,9 +5,11 @@ #include #include +#include #include #include #include + using namespace cufinufft::utils; namespace cufinufft { @@ -15,164 +17,167 @@ namespace spreadinterp { /* ------------------------ 1d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *idxnupts) { - int xx, ix; - T ker1[MAX_NSPREAD]; - - T x_rescaled; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue = ker1[xx - xstart]; - atomicAdd(&fw[ix].x, cnow.x * kervalue); - atomicAdd(&fw[ix].y, cnow.y * kervalue); - } +template +__global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, + cuda_complex *fw, int M, int ns, int nf1, T es_c, + T es_beta, T sigma, const int *idxnupts) { + int xx, ix; + T ker1[MAX_NSPREAD]; + + T x_rescaled; + cuda_complex cnow; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + cnow = c[idxnupts[i]]; + int xstart = ceil(x_rescaled - ns / 2.0); + int xend = floor(x_rescaled + ns / 2.0); + + T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + + for (xx = xstart; xx <= xend; xx++) { + ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + T kervalue = ker1[xx - xstart]; + atomicAdd(&fw[ix].x, cnow.x * kervalue); + atomicAdd(&fw[ix].y, cnow.y * kervalue); } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, int *bin_size, const T *x, - int *sortidx) { - int binx; - int oldidx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - oldidx = atomicAdd(&bin_size[binx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx) { - sortidx[i] = -binx; - } +template +__global__ void calc_bin_size_noghost_1d(int M, int nf1, int bin_size_x, int nbinx, + int *bin_size, const T *x, int *sortidx) { + int binx; + int oldidx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + oldidx = atomicAdd(&bin_size[binx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx) { + sortidx[i] = -binx; } + } } -template -__global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nbinx, const int *bin_startpts, - const int *sortidx, const T *x, int *index, int nf1) { - int binx; - T x_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - index[bin_startpts[binx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_idx_1d( + int M, int bin_size_x, int nbinx, const int *bin_startpts, const int *sortidx, + const T *x, int *index, int nf1) { + int binx; + T x_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + + index[bin_startpts[binx] + sortidx[i]] = i; + } } -template -__global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, xend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_1d_subprob( + const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, + T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, + int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + + int xstart, xend, ix; + const int subpidx = blockIdx.x; + const int bidx = subprob_to_bin[subpidx]; + const int binsubp_idx = subpidx - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + const int xoffset = (bidx % nbinx) * bin_size_x; + const auto ns_2 = (ns + 1) / 2; + const int N = bin_size_x + 2 * ns_2; + + T ker1[MAX_NSPREAD]; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i].x = T(0); + fwshared[i].y = T(0); + } + __syncthreads(); + + for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + + xstart = ceil(x_rescaled - ns / 2.0) - xoffset; + xend = floor(x_rescaled + ns / 2.0) - xoffset; + + const T x1 = T(xstart + xoffset) - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + for (int xx = xstart; xx <= xend; xx++) { + ix = xx + ns_2; + if (ix >= (bin_size_x + ns_2) || ix < 0) break; + atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); + atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); } - __syncthreads(); - - T x_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); - } - } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + ix = xoffset - ns_2 + k; + if (ix < (nf1 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + atomicAdd(&fw[ix].x, fwshared[k].x); + atomicAdd(&fw[ix].y, fwshared[k].y); } + } } /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, +template +__global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, + const cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - T ker1[MAX_NSPREAD]; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T x1 = (T)xstart - x_rescaled; - if constexpr (KEREVALMETH == 1) - eval_kernel_vec_horner(ker1, x1, ns, sigma); - else - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[ix].x * kervalue1; - cnow.y += fw[ix].y * kervalue1; - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + T ker1[MAX_NSPREAD]; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + + int xstart = ceil(x_rescaled - ns / 2.0); + int xend = floor(x_rescaled + ns / 2.0); + cuda_complex cnow; + cnow.x = 0.0; + cnow.y = 0.0; + + T x1 = (T)xstart - x_rescaled; + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + + for (int xx = xstart; xx <= xend; xx++) { + int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + T kervalue1 = ker1[xx - xstart]; + cnow.x += fw[ix].x * kervalue1; + cnow.y += fw[ix].y * kervalue1; } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 5e32cb101..f5661b1dd 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -227,8 +227,10 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { switch (dim) { case 1: { switch (opts->gpu_method) { - case 0: case 1: + opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; + break; + case 0: case 2: if (opts->gpu_binsizex < 0) { cudaGetDevice(&device_id); From 60f478033ccc9fe9ac5258e1c64a5c91230c9c40 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 12 Jul 2024 15:25:12 -0400 Subject: [PATCH 09/39] otpimized nupts driven --- include/cufinufft/contrib/helper_cuda.h | 15 +-- .../contrib/ker_horner_allw_loop.inc | 28 ++++-- include/cufinufft/spreadinterp.h | 53 +++++++++-- perftest/cuda/bench.py | 18 ++-- src/cuda/1d/spread1d_wrapper.cu | 92 ++++++++++--------- src/cuda/1d/spreadinterp1d.cuh | 69 +++++++++----- src/cuda/CMakeLists.txt | 16 ++++ 7 files changed, 199 insertions(+), 92 deletions(-) diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h index 3dade898e..c3a31bd2b 100644 --- a/include/cufinufft/contrib/helper_cuda.h +++ b/include/cufinufft/contrib/helper_cuda.h @@ -58,13 +58,14 @@ static inline cudaError_t cudaFreeWrapper(T *devPtr, cudaStream_t stream, return pool_supported ? cudaFreeAsync(devPtr, stream) : cudaFree(devPtr); } -#define RETURN_IF_CUDA_ERROR \ - { \ - cudaError_t err = cudaGetLastError(); \ - if (err != cudaSuccess) { \ - printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ - return FINUFFT_ERR_CUDA_FAILURE; \ - } \ +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s in %s at line %d\n", __func__, cudaGetErrorString(err), \ + __FILE__, __LINE__); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ } #define CUDA_FREE_AND_NULL(val, stream, pool_supported) \ diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index f905c14f0..c9c5e2ca2 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -8,7 +8,9 @@ constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + for (int i = 0; i < 2; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==3) { constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; @@ -17,7 +19,9 @@ constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); +for (int i=0; i<3; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); +} } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; @@ -27,7 +31,9 @@ constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + for (int i=0; i<4; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; @@ -38,7 +44,9 @@ constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + for (int i=0; i<5; i++) { + ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; @@ -50,7 +58,9 @@ constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + for (int i=0; i<6; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==7) { constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; @@ -63,7 +73,9 @@ constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + for (int i=0; i<7; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==8) { constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; @@ -76,7 +88,9 @@ constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + for (int i = 0; i < 8; i++) { + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + } } else if (w==9) { constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index d2f1ecd2d..7fd098925 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -9,10 +9,39 @@ namespace cufinufft { namespace spreadinterp { -template static __forceinline__ __device__ T fold_rescale(T x, int N) { - static constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - const T result = x * x2pi + T(0.5); - return (result - floor(result)) * T(N); +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { + constexpr const auto x2pi = T(0.159154943091895345554011992339482617); + constexpr const auto half = T(0.5); +#if defined(__CUDA_ARCH__) + if constexpr (std::is_same_v) { + auto result = __fmaf_rn(x, x2pi, half); + result = __fsub_rd(result, truncf(result)); + return __fmul_rd(result, static_cast(N)); + } else if constexpr (std::is_same_v) { + auto result = __fma_rn(x, x2pi, half); + result = __dsub_rd(result, trunc(result)); + return __dmul_rd(result, static_cast(N)); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } +#else + const auto result = std::fma(x, x2pi, half); + return (result - std::trunc(result)) * static_cast(N); +#endif +} + +template +static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { + if constexpr (std::is_same_v) { + return __fmaf_rn(a, b, c); + } else if constexpr (std::is_same_v) { + return __fma_rn(a, b, c); + } else { + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + } } template @@ -23,11 +52,11 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) approximation to prolate spheroidal wavefunction (PSWF) of order 0. This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - if (abs(x) >= opts.ES_halfwidth) + if (abs(x) >= T(opts.ES_halfwidth)) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp(opts.ES_beta * sqrt(1.0 - opts.ES_c * x * x)); + return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x)); } template @@ -53,7 +82,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { - T z = 2 * x + w - 1.0; // scale so local grid offset z in [-1,1] +#ifdef __CUDA_ARCH__ + __builtin_assume(w >= 2); + if constexpr (std::is_same_v) { + __builtin_assume(w <= 7); + } + if constexpr (std::is_same_v) { + __builtin_assume(w <= 16); + } +#endif + const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] + // T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... if (upsampfac == 2.0) { // floating point equality is fine here using FLT = T; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index bb288af0b..dbcaed87f 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -41,10 +41,12 @@ def build_args(args): "--n_runs": "10", "--method": "0", "--sort": "1", - # "--N1": "16777216", - "--N1": "256", - "--N2": "256", + "--N1": "16777216", + # "--N2": "256", + # "--N1": "256", + # "--N2": "256", # "--N3": "256", + "--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} # iterate over tol from 1E-6 to 1E-1 @@ -135,21 +137,21 @@ def build_args(args): pivot_df = df.pivot(index='tolerance', columns='method') # print(pivot_df) # scale the throughput SM by GM -pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] # scale setpts SM by GM -pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] # remove the GM column -pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) +# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) print(pivot_df) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value -min_val = min(pivot_df[('throughput', 'SM')].min(), 1) -max_val = max(pivot_df[('throughput', 'SM')].max(), 0) +min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) +max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) plt.ylim(min_val * .99, max_val * 1.01) # plt.ylim(.8, 1.2) diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 36fa2bef9..e958bfea3 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -16,6 +16,7 @@ using namespace cufinufft::common; using namespace cufinufft::memtransfer; #include "spreadinterp1d.cuh" +#include namespace cufinufft { namespace spreadinterp { @@ -51,10 +52,30 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) return ier; } +template struct cmp : public thrust::binary_function { + + cmp(const T *kx) : kx(kx) {} + + __host__ __device__ bool operator()(const int a, const int b) const { + return fold_rescale(kx[a], 1) < fold_rescale(kx[b], 1); + } + +private: + const T *kx; +}; + template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { auto &stream = d_plan->stream; - + if (d_plan->opts.gpu_sort && d_plan->opts.gpu_method == 1) { + int *d_idxnupts = d_plan->idxnupts; + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); + RETURN_IF_CUDA_ERROR + thrust::sort(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M, + cmp{d_plan->kx}); + RETURN_IF_CUDA_ERROR + return 0; + } if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { @@ -84,17 +105,16 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { thrust::device_ptr d_ptr(d_binsize); thrust::device_ptr d_result(d_binstartpts); thrust::exclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, nf1); RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>(M, - d_idxnupts); + thrust::sequence(thrust::cuda::par.on(stream), d_idxnupts, d_idxnupts + M); RETURN_IF_CUDA_ERROR } - return 0; } @@ -134,7 +154,6 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks RETURN_IF_CUDA_ERROR } } - return 0; } @@ -146,33 +165,29 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) which only needs to be done once. */ { - auto &stream = d_plan->stream; - int ier; - int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; - int bin_size_x = d_plan->opts.gpu_binsizex; + const auto maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; + const auto bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; return FINUFFT_ERR_BINSIZE_NOTVALID; } - int numbins = ceil((T)nf1 / bin_size_x); - - T *d_kx = d_plan->kx; - - int *d_binsize = d_plan->binsize; - int *d_binstartpts = d_plan->binstartpts; - int *d_sortidx = d_plan->sortidx; - int *d_numsubprob = d_plan->numsubprob; - int *d_subprobstartpts = d_plan->subprobstartpts; - int *d_idxnupts = d_plan->idxnupts; + const auto numbins = (nf1 + bin_size_x - 1) / bin_size_x; + const auto d_kx = d_plan->kx; + const auto d_binsize = d_plan->binsize; + const auto d_binstartpts = d_plan->binstartpts; + const auto d_sortidx = d_plan->sortidx; + const auto d_numsubprob = d_plan->numsubprob; + const auto d_subprobstartpts = d_plan->subprobstartpts; + const auto d_idxnupts = d_plan->idxnupts; + const auto stream = d_plan->stream; int *d_subprob_to_bin = nullptr; - if ((ier = - checkCudaErrors(cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_binsize, 0, numbins * sizeof(int), stream); + RETURN_IF_CUDA_ERROR calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024, 0, stream>>>( M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx); RETURN_IF_CUDA_ERROR @@ -193,30 +208,25 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(thrust::cuda::par.on(stream), d_ptr, d_ptr + n, d_result); + RETURN_IF_CUDA_ERROR - if ((ier = checkCudaErrors(cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream)))) - return ier; + cudaMemsetAsync(d_subprobstartpts, 0, sizeof(int), stream); + RETURN_IF_CUDA_ERROR - int totalnumsubprob; - if ((ier = - checkCudaErrors(cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], - sizeof(int), cudaMemcpyDeviceToHost, stream)))) - return ier; + int totalnumsubprob{}; + cudaMemcpyAsync(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), + cudaMemcpyDeviceToHost, stream); cudaStreamSynchronize(stream); - if ((ier = checkCudaErrors( - cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, - d_plan->supports_pools)))) - return ier; + RETURN_IF_CUDA_ERROR + + cudaMallocWrapper(&d_subprob_to_bin, totalnumsubprob * sizeof(int), stream, + d_plan->supports_pools); + RETURN_IF_CUDA_ERROR + map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024, 0, stream>>>( d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); - cudaFree(d_subprob_to_bin); - return FINUFFT_ERR_CUDA_FAILURE; - } - - assert(d_subprob_to_bin != NULL); + RETURN_IF_CUDA_ERROR + assert(d_subprob_to_bin != nullptr); cudaFreeWrapper(d_plan->subprob_to_bin, stream, d_plan->supports_pools); d_plan->subprob_to_bin = d_subprob_to_bin; d_plan->totalnumsubprob = totalnumsubprob; diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 9a536ec9c..68656c124 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -10,6 +10,8 @@ #include #include +#include + using namespace cufinufft::utils; namespace cufinufft { @@ -21,26 +23,33 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xx, ix; - T ker1[MAX_NSPREAD]; - T x_rescaled; - cuda_complex cnow; + auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { + if constexpr (std::is_same_v) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); + return int2{xstart, xend}; + } + if constexpr (std::is_same_v) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); + return int2{xstart, xend}; + } + }(); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + for (auto xx = xstart; xx <= xend; xx++) { + auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); T kervalue = ker1[xx - xstart]; atomicAdd(&fw[ix].x, cnow.x * kervalue); atomicAdd(&fw[ix].y, cnow.y * kervalue); @@ -87,16 +96,21 @@ __global__ void calc_inverse_of_global_sort_idx_1d( } } +template +__forceinline__ __device__ cuda_complex mul(const cuda_complex &a, const T b) { + return {a.x * b, a.y * b}; +} + template __global__ void spread_1d_subprob( const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, const int *idxnupts) { + const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { extern __shared__ char sharedbuf[]; - auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + alignas(256) auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; - int xstart, xend, ix; + int ix; const int subpidx = blockIdx.x; const int bidx = subprob_to_bin[subpidx]; const int binsubp_idx = subpidx - subprobstartpts[bidx]; @@ -106,11 +120,11 @@ __global__ void spread_1d_subprob( const auto ns_2 = (ns + 1) / 2; const int N = bin_size_x + 2 * ns_2; - T ker1[MAX_NSPREAD]; + // dynamic stack allocation + auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = T(0); - fwshared[i].y = T(0); + fwshared[i] = {0, 0}; } __syncthreads(); @@ -119,8 +133,18 @@ __global__ void spread_1d_subprob( const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); const auto cnow = c[idxnupts[idx]]; - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; + const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { + if constexpr (std::is_same_v) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); + return int2{xstart, xend}; + } + if constexpr (std::is_same_v) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); + return int2{xstart, xend}; + } + }(); const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) @@ -130,8 +154,9 @@ __global__ void spread_1d_subprob( for (int xx = xstart; xx <= xend; xx++) { ix = xx + ns_2; if (ix >= (bin_size_x + ns_2) || ix < 0) break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); + const auto result = mul(cnow, ker1[xx - xstart]); + atomicAdd(&fwshared[ix].x, result.x); + atomicAdd(&fwshared[ix].y, result.y); } } __syncthreads(); diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index d2928858b..d8b192e8b 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -13,15 +13,28 @@ set(PRECISION_DEPENDENT_SRC memtransfer_wrapper.cu deconvolve_wrapper.cu cufinufft.cu common.cu ) +set(HELPER_MATH_URL "https://raw.githubusercontent.com/NVIDIA/cuda-samples/master/Common/helper_math.h") +set(HELPER_MATH_FILE "${CMAKE_BINARY_DIR}/helper_math.h") +if(NOT EXISTS ${HELPER_MATH_FILE}) + file(DOWNLOAD ${HELPER_MATH_URL} ${HELPER_MATH_FILE}) +endif() + set(CUFINUFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/contrib + ${CMAKE_BINARY_DIR} $ $ $ ) set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) +# flush denormals to zero and enable verbose PTXAS output +set(FINUFFT_CUDA_FLAGS + -ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage + -Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr +) + add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_target_properties( @@ -30,6 +43,8 @@ set_target_properties( CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) +target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) + add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) @@ -38,6 +53,7 @@ set_target_properties( POSITION_INDEPENDENT_CODE ON CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} ) +target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) add_library(cufinufft SHARED $ From 35dcc666197a0cfb3d4ab29b3b728b86b057050e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 15 Jul 2024 16:17:20 -0400 Subject: [PATCH 10/39] Optimized 1D and 2D --- .../contrib/ker_horner_allw_loop.inc | 6 +- include/cufinufft/utils.h | 13 + perftest/cuda/bench.py | 23 +- src/cuda/1d/spread1d_wrapper.cu | 1 + src/cuda/1d/spreadinterp1d.cuh | 43 +- src/cuda/2d/interp2d_wrapper.cu | 16 +- src/cuda/2d/spread2d_wrapper.cu | 15 +- src/cuda/2d/spreadinterp2d.cuh | 568 +++++++++--------- src/cuda/common.cu | 87 ++- 9 files changed, 407 insertions(+), 365 deletions(-) diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index c9c5e2ca2..1178a8544 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -20,7 +20,7 @@ constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; for (int i=0; i<3; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==4) { constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; @@ -32,7 +32,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; for (int i=0; i<4; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==5) { constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; @@ -45,7 +45,7 @@ for (int i=0; i<3; i++) { constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; for (int i=0; i<5; i++) { - ker[i] = fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, fmaf(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); + ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); } } else if (w==6) { constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 3455b99c0..b0a77aec7 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -68,6 +68,19 @@ template T infnorm(int n, std::complex *a) { } return sqrt(nrm); } + +#ifdef __CUDA_ARCH__ +__forceinline__ __device__ auto interval(const int ns, const float x) { + const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + return int2{xstart, xend}; +} +__forceinline__ __device__ auto interval(const int ns, const double x) { + const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); + const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); + return int2{xstart, xend}; +} +#endif } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index dbcaed87f..db7e73873 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", - "--n_runs": "10", +args = {"--prec": "d", + "--n_runs": "5", "--method": "0", "--sort": "1", - "--N1": "16777216", - # "--N2": "256", - # "--N1": "256", - # "--N2": "256", + # "--N1": "16777216", + "--N1": "256", + "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -93,6 +92,10 @@ def build_args(args): conf = [x for x in stdout.splitlines() if x.startswith("#")] print('\n'.join(conf)) stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + stdout = '\n'.join(stdout) # convert stdout to a dataframe from csv string dt = pd.read_csv(io.StringIO(stdout), sep=',') @@ -153,7 +156,7 @@ def build_args(args): min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) print(min_val, max_val) -plt.ylim(min_val * .99, max_val * 1.01) +plt.ylim(min_val * .90, max_val * 1.1) # plt.ylim(.8, 1.2) # Calculate the smallest power of 10 @@ -163,15 +166,15 @@ def build_args(args): # plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin # plot an horizontal line at 1 with label "GM" -plt.axhline(y=1, color='k', linestyle='--', label='GM') +# plt.axhline(y=1, color='k', linestyle='--', label='GM') plt.xlabel('Tolerance') -plt.ylabel('Throughput (% of GM)') +plt.ylabel('Throughput') plt.title('Throughput by Tolerance and Method') plt.legend(title='Method') plt.tight_layout() plt.show() plt.xlabel("Tolerance") -plt.ylabel("Points/s (% of GM)") +plt.ylabel("Points/s") plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index e958bfea3..4e7f4ea0b 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -268,6 +268,7 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 68656c124..f94ffd7eb 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,26 +23,15 @@ template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { - + // dynamic stack allocation to reduce stack usage auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); const auto cnow = c[idxnupts[i]]; - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - const T x1 = (T)xstart - x_rescaled; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else @@ -126,27 +115,17 @@ __global__ void spread_1d_subprob( for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; } + + const T ns_2f = ns * T(.5); + __syncthreads(); for (auto i = threadIdx.x; i < nupts; i += blockDim.x) { - const auto idx = ptstart + i; - const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - const auto cnow = c[idxnupts[idx]]; - - const auto [xstart, xend] = [ns, x_rescaled]() constexpr noexcept { - if constexpr (std::is_same_v) { - const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x_rescaled)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x_rescaled)); - return int2{xstart, xend}; - } - if constexpr (std::is_same_v) { - const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x_rescaled)); - const auto xend = __double2int_rd(__fma_rd(ns, .5, x_rescaled)); - return int2{xstart, xend}; - } - }(); - - const T x1 = T(xstart + xoffset) - x_rescaled; + const auto idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto cnow = c[idxnupts[idx]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const T x1 = T(xstart + xoffset) - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index 533788482..eda0d579b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -4,10 +4,12 @@ #include #include +#include #include #include using namespace cufinufft::memtransfer; +using namespace cufinufft::common; #include "spreadinterp2d.cuh" @@ -120,17 +122,14 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->opts.upsampfac; + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -140,6 +139,7 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(interp_2d_subprob, 2, *d_plan); interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 69b2ba956..d361791b0 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -273,16 +274,14 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, T sigma = d_plan->opts.upsampfac; - size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * - (bin_size_y + 2 * (int)ceil(ns / 2.0)) * - sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + const auto sharedplanorysize = + shared_memory_required(2, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, @@ -292,6 +291,8 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 558984ea1..62a430ca5 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -15,314 +15,314 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - int xstart, ystart, xend, yend; - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - T kervalue1, kervalue2; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - ystart = ceil(y_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - outidx = ix + iy * nf1; - kervalue1 = ker1[xx - xstart]; - kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_nupts_driven( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto cnow = c[idxnupts[i]]; + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + const auto x1 = (T)xstart - x_rescaled; + const auto y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + for (auto yy = ystart; yy <= yend; yy++) { + for (auto xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto outidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + const auto kervalue2 = ker2[yy - ystart]; + atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); + atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + } + } + } } /* Kernels for SubProb Method */ // SubProb properties -template -__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, int bin_size_y, int nbinx, int nbiny, +template +__global__ void calc_bin_size_noghost_2d(int M, int nf1, int nf2, int bin_size_x, + int bin_size_y, int nbinx, int nbiny, int *bin_size, T *x, T *y, int *sortidx) { - int binidx, binx, biny; - int oldidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - if (binx >= nbinx || biny >= nbiny) { - sortidx[i] = -biny; - } + int binidx, binx, biny; + int oldidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binidx = binx + biny * nbinx; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + if (binx >= nbinx || biny >= nbiny) { + sortidx[i] = -biny; } + } } -template -__global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, - const int *bin_startpts, const int *sortidx, const T *x, - const T *y, int *index, int nf1, int nf2) { - int binx, biny; - int binidx; - T x_rescaled, y_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binidx = binx + biny * nbinx; - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_2d( + int M, int bin_size_x, int bin_size_y, int nbinx, int nbiny, const int *bin_startpts, + const int *sortidx, const T *x, const T *y, int *index, int nf1, int nf2) { + int binx, biny; + int binidx; + T x_rescaled, y_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binidx = binx + biny * nbinx; + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - T kervalue2 = ker2[yy - ystart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } +template +__global__ void spread_2d_subprob( + const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + const int subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto cnow = c[idxnupts[idx]]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + for (int yy = ystart; yy <= yend; yy++) { + const auto iy = yy + ns_2; + if (iy >= (bin_size_y + rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); + } + } + } + + __syncthreads(); + /* write to global memory */ + for (int k = threadIdx.x; k < N; k += blockDim.x) { + const auto i = k % (bin_size_x + rounded_ns); + const auto j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; +template +__global__ void interp_2d_nupts_driven( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + + T x1 = (T)xstart - x_rescaled; + T y1 = (T)ystart - y_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } + + cuda_complex cnow{0, 0}; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const auto iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const auto inidx = ix + iy * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2; + cnow.y += fw[inidx].y * kervalue1 * kervalue2; + } + } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for Subprob Method */ -template -__global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_2d_subprob( + const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, + int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, + int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = (bidx / nbinx) * bin_size_y; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + + for (int k = threadIdx.x; k < N; k += blockDim.x) { + int i = k % (bin_size_x + rounded_ns); + int j = k / (bin_size_x + rounded_ns); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + int(iy * nf1); + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - } - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } - } - c[idxnupts[idx]] = cnow; + } + __syncthreads(); + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + xend -= xoffset; + yend -= yoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto iy = yy + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; + } } + c[idxnupts[idx]] = cnow; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index f5661b1dd..e7ce65b52 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -221,18 +221,68 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } +// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize +template int find_bin_size(std::size_t MemSize, int dim, int ns) { + int binsize = 1; // Start with the smallest possible bin size + + while (true) { + // Calculate the shared memory required for the current bin_size_x and bin_size_y + std::size_t required_memory = + shared_memory_required(dim, ns, binsize, binsize, binsize); + + // Check if the required memory is less than the available memory + if (required_memory > MemSize) { + // If the condition is met, return the current bin_size_x + return binsize - 1; + } + + // Increment bin_size_x for the next iteration + binsize++; + } +} + template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { - switch (opts->gpu_method) { - case 1: - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 1024 : opts->gpu_binsizex; - break; - case 0: - case 2: - if (opts->gpu_binsizex < 0) { + if (opts->gpu_binsizex < 0) { + cudaGetDevice(&device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + if (const auto err = cudaGetLastError(); err != cudaSuccess) { + throw std::runtime_error(cudaGetErrorString(err)); + } + const int bin_size = + shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; + // find the power of 2 that is less than bin_size + // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + const auto shared_mem_required = shared_memory_required( + dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, + // shared_mem_required); + } + opts->gpu_binsizey = 1; + opts->gpu_binsizez = 1; + } break; + case 2: { + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0) { + switch (opts->gpu_method) { + case 0: + case 2: { + opts->gpu_binsizex = 32; + opts->gpu_binsizey = 32; + // fall through otherwise + if (opts->gpu_method && ns > 2) { + break; + } + } + case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); @@ -242,22 +292,17 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - const int bin_size = - shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - const int exponent = std::log2(bin_size); - opts->gpu_binsizex = 1 << (exponent - 1); - // printf("bin_size: %d, gpu_binsizex: %d\n", bin_size, - // opts->gpu_binsizex); + + const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + opts->gpu_binsizex = binsize; + opts->gpu_binsizey = binsize; + } break; } - break; } - opts->gpu_binsizey = 1; - opts->gpu_binsizez = 1; - } break; - case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 32 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 32 : opts->gpu_binsizey; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); + // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", + // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: { From 366295d41c54837250d728da6b1ef590002d1a40 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 15:18:27 -0400 Subject: [PATCH 11/39] 3D integer operations --- perftest/cuda/bench.py | 6 +- src/cuda/1d/cufinufft1d.cu | 3 - src/cuda/1d/interp1d_wrapper.cu | 4 - src/cuda/1d/spread1d_wrapper.cu | 1 - src/cuda/1d/spreadinterp1d.cuh | 1 - src/cuda/2d/cufinufft2d.cu | 6 +- src/cuda/2d/interp2d_wrapper.cu | 3 - src/cuda/2d/spread2d_wrapper.cu | 3 - src/cuda/3d/cufinufft3d.cu | 3 - src/cuda/3d/interp3d_wrapper.cu | 24 +- src/cuda/3d/spread3d_wrapper.cu | 15 +- src/cuda/3d/spreadinterp3d.cuh | 1010 ++++++++++++++++--------------- src/cuda/common.cu | 15 +- 13 files changed, 549 insertions(+), 545 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index db7e73873..8a9e757a3 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,14 +37,14 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "d", +args = {"--prec": "f", "--n_runs": "5", "--method": "0", "--sort": "1", # "--N1": "16777216", "--N1": "256", "--N2": "256", - # "--N3": "256", + "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} @@ -82,6 +82,8 @@ def build_args(args): data['method'].append('GM') elif method == '2': data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') print("Method " + data['method'][-1]) cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) stdout, stderr = run_command("nsys", cmd) diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 4ecb3b283..a17b6f044 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,9 +1,6 @@ #include #include #include -#include -#include -#include #include #include diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index cd3637c8b..2bf69f6a2 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -1,14 +1,10 @@ #include #include -#include #include -#include #include #include -using namespace cufinufft::memtransfer; - #include "spreadinterp1d.cuh" namespace cufinufft { diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 4e7f4ea0b..824da42c9 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -1,6 +1,5 @@ #include #include -#include #include #include diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index f94ffd7eb..b6c511555 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -5,7 +5,6 @@ #include #include -#include #include #include #include diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index afc801b7f..f7f7b1559 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -1,14 +1,10 @@ -#include +#include #include #include -#include -#include - #include #include #include -#include #include using namespace cufinufft::deconvolve; diff --git a/src/cuda/2d/interp2d_wrapper.cu b/src/cuda/2d/interp2d_wrapper.cu index eda0d579b..0d3d3ff9b 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -1,14 +1,11 @@ -#include #include #include #include #include -#include #include -using namespace cufinufft::memtransfer; using namespace cufinufft::common; #include "spreadinterp2d.cuh" diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index d361791b0..244d25b03 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -8,14 +7,12 @@ #include #include -#include #include #include #include "spreadinterp2d.cuh" using namespace cufinufft::common; -using namespace cufinufft::memtransfer; namespace cufinufft { namespace spreadinterp { diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index ea0ef4a86..5977e6d5f 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -1,13 +1,10 @@ #include #include -#include -#include #include #include #include -#include #include #include diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index b42231d86..91379d3ae 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -1,15 +1,15 @@ -#include #include #include #include +#include "spreadinterp3d.cuh" +#include #include #include -#include "spreadinterp3d.cuh" - using namespace cufinufft::memtransfer; +using namespace cufinufft::common; namespace cufinufft { namespace spreadinterp { @@ -123,19 +123,16 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int *d_subprob_to_bin = d_plan->subprob_to_bin; int totalnumsubprob = d_plan->totalnumsubprob; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; - return FINUFFT_ERR_INSUFFICIENT_SHMEM; - } + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, @@ -143,6 +140,7 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { + cufinufft_set_shared_memory(interp_3d_subprob, 3, *d_plan); interp_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 6c851389c..bf78ed905 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -1,5 +1,4 @@ #include -#include #include #include @@ -8,12 +7,10 @@ #include #include -#include #include #include using namespace cufinufft::common; -using namespace cufinufft::memtransfer; #include "spreadinterp3d.cuh" @@ -532,12 +529,12 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ int totalnumsubprob = d_plan->totalnumsubprob; int *d_subprob_to_bin = d_plan->subprob_to_bin; - T sigma = d_plan->spopts.upsampfac; - T es_c = d_plan->spopts.ES_c; - T es_beta = d_plan->spopts.ES_beta; - size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * - (bin_size_y + 2 * ceil(ns / 2.0)) * - (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); + T sigma = d_plan->spopts.upsampfac; + T es_c = d_plan->spopts.ES_c; + T es_beta = d_plan->spopts.ES_beta; + const auto sharedplanorysize = + shared_memory_required(3, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 838816a56..dc722ddc3 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -10,548 +10,568 @@ #include #include +using namespace cufinufft::utils; + namespace cufinufft { namespace spreadinterp { /* ---------------------- 3d Spreading Kernels -------------------------------*/ /* Kernels for bin sort NUpts */ -template -__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, int bin_size_y, - int bin_size_z, int nbinx, int nbiny, int nbinz, int *bin_size, const T *x, +template +__global__ void calc_bin_size_noghost_3d(int M, int nf1, int nf2, int nf3, int bin_size_x, + int bin_size_y, int bin_size_z, int nbinx, + int nbiny, int nbinz, int *bin_size, const T *x, const T *y, const T *z, int *sortidx) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 0 : binz; - binidx = binx + biny * nbinx + binz * nbinx * nbiny; - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 0 : binz; + binidx = binx + biny * nbinx + binz * nbinx * nbiny; + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, - int nbiny, int nbinz, const int *bin_startpts, const int *sortidx, - const T *x, const T *y, const T *z, int *index, - int nf1, int nf2, int nf3) { - int binx, biny, binz; - int binidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - binx = binx >= nbinx ? binx - 1 : binx; - binx = binx < 0 ? 0 : binx; - biny = floor(y_rescaled / bin_size_y); - biny = biny >= nbiny ? biny - 1 : biny; - biny = biny < 0 ? 0 : biny; - binz = floor(z_rescaled / bin_size_z); - binz = binz >= nbinz ? binz - 1 : binz; - binz = binz < 0 ? 0 : binz; - binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_3d( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nbinx, int nbiny, + int nbinz, const int *bin_startpts, const int *sortidx, const T *x, const T *y, + const T *z, int *index, int nf1, int nf2, int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + binx = binx >= nbinx ? binx - 1 : binx; + binx = binx < 0 ? 0 : binx; + biny = floor(y_rescaled / bin_size_y); + biny = biny >= nbiny ? biny - 1 : biny; + biny = biny < 0 ? 0 : biny; + binz = floor(z_rescaled / bin_size_z); + binz = binz >= nbinz ? binz - 1 : binz; + binz = binz < 0 ? 0 : binz; + binidx = common::calc_global_index_v2(binx, biny, binz, nbinx, nbiny, nbinz); + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } /* Kernels for NUptsdriven method */ -template -__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *idxnupts) { - int xx, yy, zz, ix, iy, iz; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - T ker1val, ker2val, ker3val; - - T x_rescaled, y_rescaled, z_rescaled; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - T z1 = (T)zstart - z_rescaled; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); - } else { - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); - } +template +__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, + const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, + T es_beta, T sigma, const int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (zz = zstart; zz <= zend; zz++) { - ker3val = ker3[zz - zstart]; - for (yy = ystart; yy <= yend; yy++) { - ker2val = ker2[yy - ystart]; - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - ker1val = ker1[xx - xstart]; - T kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto ker3val = ker3[zz - zstart]; + for (int yy = ystart; yy <= yend; yy++) { + const auto ker2val = ker2[yy - ystart]; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + const int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const auto ker1val = ker1[xx - xstart]; + const auto kervalue = ker1val * ker2val * ker3val; + atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); + atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); } + } } + } } /* Kernels for Subprob method */ -template -__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, - int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, - int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - int nbinz, int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - const int bidx = subprob_to_bin[blockIdx.x]; - const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; - const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - const int xoffset = (bidx % nbinx) * bin_size_x; - const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_subprob( + T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, + int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, + int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, + int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + int nbinz, int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared = (cuda_complex *)sharedbuf; + + const int bidx = subprob_to_bin[blockIdx.x]; + const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i] = {0, 0}; + } + __syncthreads(); + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int nuptsidx = idxnupts[ptstart + i]; + const auto x_rescaled = fold_rescale(x[nuptsidx], nf1); + const auto y_rescaled = fold_rescale(y[nuptsidx], nf2); + const auto z_rescaled = fold_rescale(z[nuptsidx], nf3); + const auto cnow = c[nuptsidx]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - const int nuptsidx = idxnupts[ptstart + i]; - const T x_rescaled = fold_rescale(x[nuptsidx], nf1); - const T y_rescaled = fold_rescale(y[nuptsidx], nf2); - const T z_rescaled = fold_rescale(z[nuptsidx], nf3); - cuda_complex cnow = c[nuptsidx]; - - const int xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - const int ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - const int zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - const int xend = floor(x_rescaled + ns / 2.0) - xoffset; - const int yend = floor(y_rescaled + ns / 2.0) - yoffset; - const int zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - const T kervalue3 = ker3[zz - zstart]; - const int iz = zz + ceil(ns / 2.0); - if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0) - break; - for (int yy = ystart; yy <= yend; yy++) { - const T kervalue2 = ker2[yy - ystart]; - const int iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - const int ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - const int outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - const T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + for (int zz = zstart; zz <= zend; zz++) { + const T kervalue3 = ker3[zz - zstart]; + const int iz = zz + ns_2; + if (iz >= (bin_size_z + (int)rounded_ns) || iz < 0) break; + for (int yy = ystart; yy <= yend; yy++) { + const T kervalue2 = ker2[yy - ystart]; + const int iy = yy + ns_2; + if (iy >= (bin_size_y + (int)rounded_ns) || iy < 0) break; + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx + ns_2; + if (ix >= (bin_size_x + (int)rounded_ns) || ix < 0) break; + const int outidx = ix + iy * (bin_size_x + rounded_ns) + + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; + const auto resx = cnow.x * kervalue; + const auto resy = cnow.y * kervalue; + atomicAdd(&fwshared[outidx].x, resx); + atomicAdd(&fwshared[outidx].y, resy); } + } } - __syncthreads(); - - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - const int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - const int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - const int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - int ix = xoffset - ceil(ns / 2.0) + i; - int iy = yoffset - ceil(ns / 2.0) + j; - int iz = zoffset - ceil(ns / 2.0) + k; - - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - const int outidx = ix + iy * nf1 + iz * nf1 * nf2; - const int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } + } + __syncthreads(); + + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + const int i = n % (bin_size_x + rounded_ns); + const int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + const int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + + int ix = xoffset - ns_2 + i; + int iy = yoffset - ns_2 + j; + int iz = zoffset - ns_2 + k; + + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); + atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } + } } /* Kernels for BlockGather Method */ -template -__global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, - int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, - int *bin_size, const T *x, const T *y, const T *z, int *sortidx, - int nf1, int nf2, int nf3) { - int binidx, binx, biny, binz; - int oldidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled / bin_size_z); - binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - oldidx = atomicAdd(&bin_size[binidx], 1); - sortidx[i] = oldidx; - } +template +__global__ void locate_nupts_to_bins_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_size, + const T *x, const T *y, const T *z, int *sortidx, int nf1, int nf2, int nf3) { + int binidx, binx, biny, binz; + int oldidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + oldidx = atomicAdd(&bin_size[binidx], 1); + sortidx[i] = oldidx; + } } -template -__global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, - int nobinx, int nobiny, int nobinz, int binsperobinx, - int binsperobiny, int binsperobinz, int *bin_startpts, - const int *sortidx, const T *x, const T *y, const T *z, - int *index, int nf1, int nf2, int nf3) { - int binx, biny, binz; - int binidx; - T x_rescaled, y_rescaled, z_rescaled; - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; i += gridDim.x * blockDim.x) { - x_rescaled = fold_rescale(x[i], nf1); - y_rescaled = fold_rescale(y[i], nf2); - z_rescaled = fold_rescale(z[i], nf3); - binx = floor(x_rescaled / bin_size_x); - biny = floor(y_rescaled / bin_size_y); - binz = floor(z_rescaled / bin_size_z); - binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); - biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); - binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); - - binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, binsperobinx, binsperobiny, - binsperobinz); - - index[bin_startpts[binidx] + sortidx[i]] = i; - } +template +__global__ void calc_inverse_of_global_sort_index_ghost( + int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, + int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, int *bin_startpts, + const int *sortidx, const T *x, const T *y, const T *z, int *index, int nf1, int nf2, + int nf3) { + int binx, biny, binz; + int binidx; + T x_rescaled, y_rescaled, z_rescaled; + for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; + i += gridDim.x * blockDim.x) { + x_rescaled = fold_rescale(x[i], nf1); + y_rescaled = fold_rescale(y[i], nf2); + z_rescaled = fold_rescale(z[i], nf3); + binx = floor(x_rescaled / bin_size_x); + biny = floor(y_rescaled / bin_size_y); + binz = floor(z_rescaled / bin_size_z); + binx = binx / (binsperobinx - 2) * binsperobinx + (binx % (binsperobinx - 2) + 1); + biny = biny / (binsperobiny - 2) * binsperobiny + (biny % (binsperobiny - 2) + 1); + binz = binz / (binsperobinz - 2) * binsperobinz + (binz % (binsperobinz - 2) + 1); + + binidx = common::calc_global_index(binx, biny, binz, nobinx, nobiny, nobinz, + binsperobinx, binsperobiny, binsperobinz); + + index[bin_startpts[binidx] + sortidx[i]] = i; + } } -template -__global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - T sigma, const int *binstartpts, int obin_size_x, int obin_size_y, - int obin_size_z, int binsperobin, int *subprob_to_bin, - const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, - int nobinz, const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, zstart, xend, yend, zend; - int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; - int subpidx = blockIdx.x; - int obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = - min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - obinsubp_idx * maxsubprobsize); - - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - - int N = obin_size_x * obin_size_y * obin_size_z; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; +template +__global__ void spread_3d_block_gather( + const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, int obin_size_x, int obin_size_y, int obin_size_z, + int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, + int nobinx, int nobiny, int nobinz, const int *idxnupts) { + extern __shared__ char sharedbuf[]; + cuda_complex *fwshared = (cuda_complex *)sharedbuf; + + int xstart, ystart, zstart, xend, yend, zend; + int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; + int subpidx = blockIdx.x; + int obidx = subprob_to_bin[subpidx]; + int bidx = obidx * binsperobin; + + int obinsubp_idx = subpidx - subprobstartpts[obidx]; + int ix, iy, iz; + int outidx; + int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; + int nupts = min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - + obinsubp_idx * maxsubprobsize); + + int xoffset = (obidx % nobinx) * obin_size_x; + int yoffset = (obidx / nobinx) % nobiny * obin_size_y; + int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; + + int N = obin_size_x * obin_size_y * obin_size_z; + + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; + + for (int i = threadIdx.x; i < N; i += blockDim.x) { + fwshared[i].x = 0.0; + fwshared[i].y = 0.0; + } + __syncthreads(); + + T x_rescaled, y_rescaled, z_rescaled; + cuda_complex cnow; + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + int nidx = idxnupts[ptstart + i]; + int b = nidx / M; + int box[3]; + for (int d = 0; d < 3; d++) { + box[d] = b % 3; + if (box[d] == 1) box[d] = -1; + if (box[d] == 2) box[d] = 1; + b = b / 3; + } + int ii = nidx % M; + x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; + y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; + z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; + cnow = c[ii]; + + xstart = ceil(x_rescaled - ns / 2.0) - xoffset; + ystart = ceil(y_rescaled - ns / 2.0) - yoffset; + zstart = ceil(z_rescaled - ns / 2.0) - zoffset; + xend = floor(x_rescaled + ns / 2.0) - xoffset; + yend = floor(y_rescaled + ns / 2.0) - yoffset; + zend = floor(z_rescaled + ns / 2.0) - zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); } - __syncthreads(); - - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int nidx = idxnupts[ptstart + i]; - int b = nidx / M; - int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) - box[d] = -1; - if (box[d] == 2) - box[d] = 1; - b = b / 3; - } - int ii = nidx % M; - x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; - y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; - z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - xstartnew = xstart < 0 ? 0 : xstart; - ystartnew = ystart < 0 ? 0 : ystart; - zstartnew = zstart < 0 ? 0 : zstart; - xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; - yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; - zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; - - for (int zz = zstartnew; zz <= zendnew; zz++) { - T kervalue3 = ker3[zz - zstart]; - for (int yy = ystartnew; yy <= yendnew; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstartnew; xx <= xendnew; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } + xstartnew = xstart < 0 ? 0 : xstart; + ystartnew = ystart < 0 ? 0 : ystart; + zstartnew = zstart < 0 ? 0 : zstart; + xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; + yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; + zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; + + for (int zz = zstartnew; zz <= zendnew; zz++) { + T kervalue3 = ker3[zz - zstart]; + for (int yy = ystartnew; yy <= yendnew; yy++) { + T kervalue2 = ker2[yy - ystart]; + for (int xx = xstartnew; xx <= xendnew; xx++) { + outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; + T kervalue1 = ker1[xx - xstart]; + atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); + atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); } + } } - __syncthreads(); - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % obin_size_x; - int j = (n / obin_size_x) % obin_size_y; - int k = n / (obin_size_x * obin_size_y); - - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; - atomicAdd(&fw[outidx].x, fwshared[n].x); - atomicAdd(&fw[outidx].y, fwshared[n].y); - } + } + __syncthreads(); + /* write to global memory */ + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % obin_size_x; + int j = (n / obin_size_x) % obin_size_y; + int k = n / (obin_size_x * obin_size_y); + + ix = xoffset + i; + iy = yoffset + j; + iz = zoffset + k; + outidx = ix + iy * nf1 + iz * nf1 * nf2; + atomicAdd(&fw[outidx].x, fwshared[n].x); + atomicAdd(&fw[outidx].y, fwshared[n].y); + } } /* ---------------------- 3d Interpolation Kernels ---------------------------*/ /* Kernels for NUptsdriven Method */ -template -__global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, T sigma, int *idxnupts) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); - T y_rescaled = fold_rescale(y[idxnupts[i]], nf2); - T z_rescaled = fold_rescale(z[idxnupts[i]], nf3); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart - z_rescaled, ns, es_c, es_beta); - } +template +__global__ void interp_3d_nupts_driven( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; + i += blockDim.x * gridDim.x) { + const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[i]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[i]], nf3); + + const auto [xstart, xend] = interval(ns, x_rescaled); + const auto [ystart, yend] = interval(ns, y_rescaled); + const auto [zstart, zend] = interval(ns, z_rescaled); + + const auto x1 = T(xstart) - x_rescaled; + const auto y1 = T(ystart) - y_rescaled; + const auto z1 = T(zstart) - z_rescaled; + + cuda_complex cnow{0, 0}; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); + } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int inidx = ix + iy * nf1 + iz * nf2 * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); + for (int xx = xstart; xx <= xend; xx++) { + const int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const int inidx = ix + iy * nf1 + iz * nf2 * nf1; + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fw[inidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fw[inidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + } } + c[idxnupts[i]].x = cnow.x; + c[idxnupts[i]].y = cnow.y; + } } /* Kernels for SubProb Method */ -template -__global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, - const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, - int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, - const int *idxnupts) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend, zstart, zend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - iz = zoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } +template +__global__ void interp_3d_subprob( + const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, + int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, + const int *idxnupts) { + extern __shared__ char sharedbuf[]; + auto fwshared = (cuda_complex *)sharedbuf; + + auto ker = (T *)alloca(sizeof(T) * ns * 2); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; + + const auto subpidx = blockIdx.x; + const auto bidx = subprob_to_bin[subpidx]; + const auto binsubp_idx = subpidx - subprobstartpts[bidx]; + const auto ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const auto nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + + const auto xoffset = (bidx % nbinx) * bin_size_x; + const auto yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const auto zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + + const T ns_2f = ns * T(.5); + const auto ns_2 = (ns + 1) / 2; + const auto rounded_ns = ns_2 * 2; + + const int N = + (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns) * (bin_size_z + rounded_ns); + + for (int n = threadIdx.x; n < N; n += blockDim.x) { + int i = n % (bin_size_x + rounded_ns); + int j = (n / (bin_size_x + rounded_ns)) % (bin_size_y + rounded_ns); + int k = n / ((bin_size_x + rounded_ns) * (bin_size_y + rounded_ns)); + auto ix = xoffset - ns_2 + i; + auto iy = yoffset - ns_2 + j; + auto iz = zoffset - ns_2 + k; + if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2) && iz < (nf3 + ns_2)) { + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; + int sharedidx = i + j * (bin_size_x + rounded_ns) + + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + fwshared[sharedidx].x = fw[outidx].x; + fwshared[sharedidx].y = fw[outidx].y; + } + } + __syncthreads(); + + for (int i = threadIdx.x; i < nupts; i += blockDim.x) { + const int idx = ptstart + i; + const auto x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); + const auto y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); + const auto z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); + cuda_complex cnow{0, 0}; + + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - __syncthreads(); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = fold_rescale(x[idxnupts[idx]], nf1); - y_rescaled = fold_rescale(y[idxnupts[idx]], nf2); - z_rescaled = fold_rescale(z[idxnupts[idx]], nf3); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); - } - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; - } - } + for (int zz = zstart; zz <= zend; zz++) { + const auto kervalue3 = ker3[zz - zstart]; + const auto iz = zz + ns_2; + for (int yy = ystart; yy <= yend; yy++) { + const auto kervalue2 = ker2[yy - ystart]; + const auto iy = yy + ns_2; + for (int xx = xstart; xx <= xend; xx++) { + const auto ix = xx + ns_2; + const auto outidx = ix + iy * (bin_size_x + rounded_ns) + + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); + const auto kervalue1 = ker1[xx - xstart]; + cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; + cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; + } } + c[idxnupts[idx]].x = cnow.x; + c[idxnupts[idx]].y = cnow.y; + } } } // namespace spreadinterp diff --git a/src/cuda/common.cu b/src/cuda/common.cu index e7ce65b52..1552076ee 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -310,9 +310,18 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { case 0: case 1: case 2: { - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 16 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 16 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 2 : opts->gpu_binsizez; + if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0 || opts->gpu_binsizez < 0) { + opts->gpu_binsizex = 16; + opts->gpu_binsizey = 16; + opts->gpu_binsizez = 2; + // const auto shared_mem_required = shared_memory_required( + // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, + // opts->gpu_binsizez); + // printf( + // "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld + // (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey, + // opts->gpu_binsizez, shared_mem_required); + } } break; case 4: { opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; From 24bf6beb68e88c05ea2c9fa1bbb23eb4a787fb51 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 15:43:57 -0400 Subject: [PATCH 12/39] 3D SM and GM optimized --- perftest/cuda/bench.py | 2 +- src/cuda/common.cu | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8a9e757a3..7af6b0bc1 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -74,7 +74,7 @@ def build_args(args): for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) - for method in ['2', '1']: + for method in ['4', '2']: args["--method"] = method if method == '0': data['method'].append('auto') diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 1552076ee..64c5639dc 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -277,11 +277,7 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { case 2: { opts->gpu_binsizex = 32; opts->gpu_binsizey = 32; - // fall through otherwise - if (opts->gpu_method && ns > 2) { - break; - } - } + } break; case 1: { cudaGetDevice(&device_id); if (const auto err = cudaGetLastError(); err != cudaSuccess) { From 960117a33109b60001797cf2045992a04f3a8406 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 18 Jul 2024 16:26:58 -0400 Subject: [PATCH 13/39] bump cuda version --- Jenkinsfile | 2 +- perftest/cuda/bench.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 6600c1cc3..c733a9436 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { stage('main') { agent { dockerfile { - filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64' + filename 'tools/cufinufft/docker/cuda12.0/Dockerfile-x86_64' args '--gpus 2' label 'v100' } diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 7af6b0bc1..8a9e757a3 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -74,7 +74,7 @@ def build_args(args): for i in range(1, 7): args["--tol"] = "1E-" + str(i) print("Running with tol = 1E-" + str(i)) - for method in ['4', '2']: + for method in ['2', '1']: args["--method"] = method if method == '0': data['method'].append('auto') From c1b14c66b34e737dc6ce48a2ab2e7d997c0b0187 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 11:22:39 -0400 Subject: [PATCH 14/39] changed matlab to generate necessary cuda upsampfact files --- devel/gen_all_horner_C_code.m | 8 ++++---- devel/gen_ker_horner_loop_C_code.m | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index 5ac28cb95..360725570 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -9,11 +9,11 @@ opts = struct(); ws = 2:16; -upsampfac = 2; % sigma (upsampling): either 2 (default) or low (eg 5/4). -opts.wpad = true; % pad kernel eval to multiple of 4 +upsampfac = 1.25; % sigma (upsampling): either 2 (default) or low (eg 5/4). +opts.wpad = false; % pad kernel eval to multiple of 4 -if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop.c','w'); -else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop.c','w'); +if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w'); +else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m index 12fe74baa..9c0b6d1ed 100644 --- a/devel/gen_ker_horner_loop_C_code.m +++ b/devel/gen_ker_horner_loop_C_code.m @@ -35,7 +35,7 @@ width = w; end for n=1:d % loop over poly coeff powers - s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1)); + s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1)); for i=2:width % loop over segments s = sprintf('%s, %.16E', s, C(n,i)); end From f300d2d8839cdc51381ef6516c18e25aeb1060ab Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 11:26:29 -0400 Subject: [PATCH 15/39] added new coeffs --- .../contrib/ker_horner_allw_loop.inc | 389 ++++++++---------- .../ker_lowupsampfac_horner_allw_loop.inc | 192 +++++++++ 2 files changed, 375 insertions(+), 206 deletions(-) create mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 1178a8544..953c4618b 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -1,230 +1,207 @@ // Code generated by gen_all_horner_C_code.m in finufft/devel // Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) 2018, The Simons Foundation, Inc. +// (C) The Simons Foundation, Inc. if (w==2) { - constexpr CUFINUFFT_FLT c0[] = {4.5147043243215343E+01, 4.5147043243215336E+01}; - constexpr CUFINUFFT_FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01}; - constexpr CUFINUFFT_FLT c2[] = {-1.8395117920046662E+00, -1.8395117920046617E+00}; - constexpr CUFINUFFT_FLT c3[] = {-2.0382426253182079E+01, 2.0382426253182079E+01}; - constexpr CUFINUFFT_FLT c4[] = {-2.0940804433577291E+00, -2.0940804433577358E+00}; - constexpr CUFINUFFT_FLT c5[] = {3.1328044596872613E+00, -3.1328044596872546E+00}; - for (int i = 0; i < 2; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, c5[i], c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; + constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; + constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; + constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; + constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); } else if (w==3) { - constexpr CUFINUFFT_FLT c0[] = {1.5653991189315124E+02, 8.8006872410780340E+02, 1.5653991189967161E+02}; - constexpr CUFINUFFT_FLT c1[] = {3.1653018869611071E+02, 2.1722031447974492E-14, -3.1653018868907077E+02}; - constexpr CUFINUFFT_FLT c2[] = {1.7742692790454473E+02, -3.3149255274727807E+02, 1.7742692791117116E+02}; - constexpr CUFINUFFT_FLT c3[] = {-1.5357716116473128E+01, -5.1917435849174007E-16, 1.5357716122720189E+01}; - constexpr CUFINUFFT_FLT c4[] = {-3.7757583061523604E+01, 5.3222970968867436E+01, -3.7757583054647363E+01}; - constexpr CUFINUFFT_FLT c5[] = {-3.9654011076088960E+00, 6.0642442697108023E-14, 3.9654011139270056E+00}; - constexpr CUFINUFFT_FLT c6[] = {3.3694352031960180E+00, -4.8817394017826032E+00, 3.3694352094301192E+00}; -for (int i=0; i<3; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c6[i], c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); -} + constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; + constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; + constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; + constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; + constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; + constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==4) { - constexpr CUFINUFFT_FLT c0[] = {5.4284366850213223E+02, 1.0073871433088403E+04, 1.0073871433088401E+04, 5.4284366850213223E+02}; - constexpr CUFINUFFT_FLT c1[] = {1.4650917259256937E+03, 6.1905285583602872E+03, -6.1905285583602890E+03, -1.4650917259256942E+03}; - constexpr CUFINUFFT_FLT c2[] = {1.4186910680718343E+03, -1.3995339862725584E+03, -1.3995339862725591E+03, 1.4186910680718338E+03}; - constexpr CUFINUFFT_FLT c3[] = {5.1133995502497419E+02, -1.4191608683682987E+03, 1.4191608683682980E+03, -5.1133995502497419E+02}; - constexpr CUFINUFFT_FLT c4[] = {-4.8293622641173549E+01, 3.9393732546136526E+01, 3.9393732546137308E+01, -4.8293622641173634E+01}; - constexpr CUFINUFFT_FLT c5[] = {-7.8386867802392118E+01, 1.4918904800408907E+02, -1.4918904800408754E+02, 7.8386867802392175E+01}; - constexpr CUFINUFFT_FLT c6[] = {-1.0039212571700762E+01, 5.0626747735616444E+00, 5.0626747735613531E+00, -1.0039212571700721E+01}; - constexpr CUFINUFFT_FLT c7[] = {4.7282853097645736E+00, -9.5966330409183929E+00, 9.5966330409170837E+00, -4.7282853097647068E+00}; - for (int i=0; i<4; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c7[i], c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; + constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; + constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; + constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; + constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; + constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; + constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==5) { - constexpr CUFINUFFT_FLT c0[] = {9.9223677575398324E+02, 3.7794697666613341E+04, 9.8715771010760523E+04, 3.7794697666613290E+04, 9.9223677575398494E+02}; - constexpr CUFINUFFT_FLT c1[] = {3.0430174925083820E+03, 3.7938404259811403E+04, 2.7804200253407354E-12, -3.7938404259811381E+04, -3.0430174925083838E+03}; - constexpr CUFINUFFT_FLT c2[] = {3.6092689177271218E+03, 7.7501368899498566E+03, -2.2704627332474989E+04, 7.7501368899498684E+03, 3.6092689177271227E+03}; - constexpr CUFINUFFT_FLT c3[] = {1.9990077310495410E+03, -3.8875294641277278E+03, 3.8628399128660033E-12, 3.8875294641277342E+03, -1.9990077310495410E+03}; - constexpr CUFINUFFT_FLT c4[] = {4.0071733590403858E+02, -1.5861137916762520E+03, 2.3839858699098813E+03, -1.5861137916762589E+03, 4.0071733590403880E+02}; - constexpr CUFINUFFT_FLT c5[] = {-9.1301168206167731E+01, 1.2316471075214690E+02, 1.0425607383569405E-11, -1.2316471075215136E+02, 9.1301168206167446E+01}; - constexpr CUFINUFFT_FLT c6[] = {-5.5339722671223782E+01, 1.1960590540261434E+02, -1.5249941358312017E+02, 1.1960590540261727E+02, -5.5339722671222638E+01}; - constexpr CUFINUFFT_FLT c7[] = {-3.3762488150349701E+00, 2.2839981872969930E+00, 3.9507985966337744E-12, -2.2839981872938613E+00, 3.3762488150346224E+00}; - constexpr CUFINUFFT_FLT c8[] = {2.5183531846827609E+00, -5.3664382310942162E+00, 6.6969190369431528E+00, -5.3664382311060113E+00, 2.5183531846825087E+00}; - for (int i=0; i<5; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c8[i], c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; + constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; + constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; + constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; + constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; + constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; + constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; + constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==6) { - constexpr CUFINUFFT_FLT c0[] = {2.0553833234911881E+03, 1.5499537739913142E+05, 8.1177907023291197E+05, 8.1177907023291243E+05, 1.5499537739913136E+05, 2.0553833235005709E+03}; - constexpr CUFINUFFT_FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917662E+05, -3.1559612614917639E+05, -2.0581923258843314E+05, -7.1269776034341376E+03}; - constexpr CUFINUFFT_FLT c2[] = {1.0023404568475091E+04, 9.0916650498360163E+04, -1.0095927514054625E+05, -1.0095927514054641E+05, 9.0916650498360133E+04, 1.0023404568484631E+04}; - constexpr CUFINUFFT_FLT c3[] = {7.2536109410387417E+03, 4.8347162752603172E+03, -5.0512736602018493E+04, 5.0512736602018464E+04, -4.8347162752602935E+03, -7.2536109410297549E+03}; - constexpr CUFINUFFT_FLT c4[] = {2.7021878300949775E+03, -7.8773465553971982E+03, 5.2105876478344171E+03, 5.2105876478344435E+03, -7.8773465553972501E+03, 2.7021878301048719E+03}; - constexpr CUFINUFFT_FLT c5[] = {3.2120291706547602E+02, -1.8229189469937089E+03, 3.7928113414428362E+03, -3.7928113414427862E+03, 1.8229189469936987E+03, -3.2120291705638107E+02}; - constexpr CUFINUFFT_FLT c6[] = {-1.2051267090537493E+02, 2.2400507411396228E+02, -1.2506575852544464E+02, -1.2506575852534223E+02, 2.2400507411397808E+02, -1.2051267089640046E+02}; - constexpr CUFINUFFT_FLT c7[] = {-4.5977202613351125E+01, 1.1536880606853479E+02, -1.7819720186493950E+02, 1.7819720186493225E+02, -1.1536880606854527E+02, 4.5977202622148695E+01}; - constexpr CUFINUFFT_FLT c8[] = {-1.5631081288828985E+00, 7.1037430592828998E-01, -6.9838401131851052E-02, -6.9838401215353244E-02, 7.1037430589405925E-01, -1.5631081203763799E+00}; - constexpr CUFINUFFT_FLT c9[] = {1.7872002109952807E+00, -4.0452381056429791E+00, 5.8969107680858182E+00, -5.8969107681844992E+00, 4.0452381056487843E+00, -1.7872002036951482E+00}; - for (int i=0; i<6; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c9[i], c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; + constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; + constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; + constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; + constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; + constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; + constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; + constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; + constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==7) { - constexpr CUFINUFFT_FLT c0[] = {3.9948351830487572E+03, 5.4715865608590818E+05, 5.0196413492771797E+06, 9.8206709220713284E+06, 5.0196413492771862E+06, 5.4715865608590830E+05, 3.9948351830642591E+03}; - constexpr CUFINUFFT_FLT c1[] = {1.5290160332974685E+04, 8.7628248584320396E+05, 3.4421061790934447E+06, -1.3062175007082776E-26, -3.4421061790934466E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}; - constexpr CUFINUFFT_FLT c2[] = {2.4458227486779248E+04, 5.3904618484139408E+05, 2.4315566181017426E+05, -1.6133959371974319E+06, 2.4315566181017403E+05, 5.3904618484139384E+05, 2.4458227486795098E+04}; - constexpr CUFINUFFT_FLT c3[] = {2.1166189345881645E+04, 1.3382732160223144E+05, -3.3113450969689671E+05, -6.5160817568418758E-10, 3.3113450969689724E+05, -1.3382732160223127E+05, -2.1166189345866882E+04}; - constexpr CUFINUFFT_FLT c4[] = {1.0542795672344866E+04, -7.0739172265096213E+03, -6.5563293056048453E+04, 1.2429734005960147E+05, -6.5563293056048846E+04, -7.0739172265096058E+03, 1.0542795672361211E+04}; - constexpr CUFINUFFT_FLT c5[] = {2.7903491906228414E+03, -1.0975382873973065E+04, 1.3656979541144814E+04, 1.2638008605419305E-09, -1.3656979541144177E+04, 1.0975382873973065E+04, -2.7903491906078302E+03}; - constexpr CUFINUFFT_FLT c6[] = {1.6069721418053450E+02, -1.5518707872250775E+03, 4.3634273936637373E+03, -5.9891976420593228E+03, 4.3634273936637110E+03, -1.5518707872251396E+03, 1.6069721419533406E+02}; - constexpr CUFINUFFT_FLT c7[] = {-1.2289277373867886E+02, 2.8583630927743752E+02, -2.8318194617301111E+02, -8.6523823682922648E-10, 2.8318194617373905E+02, -2.8583630927755564E+02, 1.2289277375320185E+02}; - constexpr CUFINUFFT_FLT c8[] = {-3.2270164914248042E+01, 9.1892112257600488E+01, -1.6710678096332572E+02, 2.0317049305437533E+02, -1.6710678096375165E+02, 9.1892112257478516E+01, -3.2270164900225943E+01}; - constexpr CUFINUFFT_FLT c9[] = {-1.4761409684737312E-01, -9.1862771282699363E-01, 1.2845147738991460E+00, 2.0325596081255337E-10, -1.2845147731561355E+00, 9.1862771288504130E-01, 1.4761410890750706E-01}; - constexpr CUFINUFFT_FLT c10[] = {1.0330620799191630E+00, -2.6798144967451138E+00, 4.4142511561803381E+00, -5.1799254918189979E+00, 4.4142511544246821E+00, -2.6798144968294695E+00, 1.0330620914479023E+00}; - for (int i=0; i<7; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; + constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; + constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; + constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; + constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; + constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; + constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; + constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; + constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; + constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==8) { - constexpr CUFINUFFT_FLT c0[] = {7.3898000697447951E+03, 1.7297637497600042E+06, 2.5578341605285816E+07, 8.4789650417103380E+07, 8.4789650417103380E+07, 2.5578341605285820E+07, 1.7297637497600049E+06, 7.3898000697448042E+03}; - constexpr CUFINUFFT_FLT c1[] = {3.0719636811267595E+04, 3.1853145713323937E+06, 2.3797981861403696E+07, 2.4569731244678468E+07, -2.4569731244678464E+07, -2.3797981861403700E+07, -3.1853145713323932E+06, -3.0719636811267599E+04}; - constexpr CUFINUFFT_FLT c2[] = {5.4488498478251720E+04, 2.4101183255475122E+06, 6.4554051283428278E+06, -8.9200440393090621E+06, -8.9200440393090658E+06, 6.4554051283428278E+06, 2.4101183255475122E+06, 5.4488498478251720E+04}; - constexpr CUFINUFFT_FLT c3[] = {5.3926359802542131E+04, 9.0469037926849292E+05, -6.0897036277696094E+05, -3.0743852105800072E+06, 3.0743852105800039E+06, 6.0897036277696339E+05, -9.0469037926849292E+05, -5.3926359802542116E+04}; - constexpr CUFINUFFT_FLT c4[] = {3.2444118016247583E+04, 1.3079802224392195E+05, -5.8652889370128501E+05, 4.2333306008153502E+05, 4.2333306008153904E+05, -5.8652889370128524E+05, 1.3079802224392162E+05, 3.2444118016247587E+04}; - constexpr CUFINUFFT_FLT c5[] = {1.1864306345505289E+04, -2.2700360645707628E+04, -5.0713607251413239E+04, 1.8308704458211805E+05, -1.8308704458211269E+05, 5.0713607251412053E+04, 2.2700360645707922E+04, -1.1864306345505289E+04}; - constexpr CUFINUFFT_FLT c6[] = {2.2812256770903182E+03, -1.1569135767378117E+04, 2.0942387020799080E+04, -1.1661592834949530E+04, -1.1661592834949715E+04, 2.0942387020801576E+04, -1.1569135767377431E+04, 2.2812256770903446E+03}; - constexpr CUFINUFFT_FLT c7[] = {8.5503535636805026E+00, -9.7513976461269635E+02, 3.8242995179157779E+03, -6.9201295567256420E+03, 6.9201295567222760E+03, -3.8242995179195914E+03, 9.7513976461218783E+02, -8.5503535636857091E+00}; - constexpr CUFINUFFT_FLT c8[] = {-1.0230637348345583E+02, 2.8246898554291380E+02, -3.8638201738179225E+02, 1.9106407993005959E+02, 1.9106407993232122E+02, -3.8638201738334749E+02, 2.8246898554236805E+02, -1.0230637348345877E+02}; - constexpr CUFINUFFT_FLT c9[] = {-1.9200143062948566E+01, 6.1692257626799076E+01, -1.2981109187842986E+02, 1.8681284209951576E+02, -1.8681284210285929E+02, 1.2981109187694383E+02, -6.1692257626659767E+01, 1.9200143062946392E+01}; - constexpr CUFINUFFT_FLT c10[] = {3.7894993760901435E-01, -1.7334408837152924E+00, 2.5271184066312142E+00, -1.2600963963387819E+00, -1.2600963946516730E+00, 2.5271184093306061E+00, -1.7334408836731170E+00, 3.7894993761824158E-01}; - for (int i = 0; i < 8; i++) { - ker[i] = fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, fma(z, c10[i], c9[i]), c8[i]), c7[i]), c6[i]), c5[i]), c4[i]), c3[i]), c2[i]), c1[i]), c0[i]); - } + constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; + constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; + constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; + constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; + constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; + constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; + constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; + constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; + constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; + constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; + constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==9) { - constexpr CUFINUFFT_FLT c0[] = {1.3136365370186117E+04, 5.0196413492771843E+06, 1.1303327711722571E+08, 5.8225443924996734E+08, 9.7700272582690704E+08, 5.8225443924996817E+08, 1.1303327711722572E+08, 5.0196413492772235E+06, 1.3136365370186102E+04}; - constexpr CUFINUFFT_FLT c1[] = {5.8623313038274340E+04, 1.0326318537280340E+07, 1.2898448324824861E+08, 3.0522863709830379E+08, 2.2777200847591304E-08, -3.0522863709830391E+08, -1.2898448324824867E+08, -1.0326318537280390E+07, -5.8623313038274362E+04}; - constexpr CUFINUFFT_FLT c2[] = {1.1335001341875963E+05, 9.0726133144784775E+06, 5.3501544534038082E+07, -2.6789524644150439E+05, -1.2483923718899380E+08, -2.6789524644173466E+05, 5.3501544534038067E+07, 9.0726133144785129E+06, 1.1335001341875964E+05}; - constexpr CUFINUFFT_FLT c3[] = {1.2489113703229750E+05, 4.3035547171861930E+06, 6.3021978510598894E+06, -2.6014941986659020E+07, 2.8258041381448560E-08, 2.6014941986659355E+07, -6.3021978510598978E+06, -4.3035547171862079E+06, -1.2489113703229750E+05}; - constexpr CUFINUFFT_FLT c4[] = {8.6425493435991229E+04, 1.0891182836653332E+06, -2.0713033564200329E+06, -2.8994941183505855E+06, 7.5905338661207352E+06, -2.8994941183504057E+06, -2.0713033564200525E+06, 1.0891182836653360E+06, 8.6425493435991244E+04}; - constexpr CUFINUFFT_FLT c5[] = {3.8657354724013807E+04, 7.9936390113327987E+04, -7.0458265546792350E+05, 1.0151095605715724E+06, 8.7808418931366203E-08, -1.0151095605718571E+06, 7.0458265546792292E+05, -7.9936390113333473E+04, -3.8657354724013807E+04}; - constexpr CUFINUFFT_FLT c6[] = {1.0779131453134632E+04, -3.3466718311303863E+04, -1.3245366619006214E+04, 1.8238470515351585E+05, -2.9285656292984058E+05, 1.8238470515350348E+05, -1.3245366619016511E+04, -3.3466718311298035E+04, 1.0779131453134652E+04}; - constexpr CUFINUFFT_FLT c7[] = {1.4992527030548451E+03, -9.7024371533906651E+03, 2.3216330734046409E+04, -2.3465262819075571E+04, -3.7031099746142328E-08, 2.3465262819179152E+04, -2.3216330734079289E+04, 9.7024371533883768E+03, -1.4992527030548429E+03}; - constexpr CUFINUFFT_FLT c8[] = {-7.9857427421137089E+01, -4.0585588534737309E+02, 2.6054813773474157E+03, -6.1806593581211082E+03, 8.0679596873751289E+03, -6.1806593581509942E+03, 2.6054813773256465E+03, -4.0585588535330419E+02, -7.9857427421164303E+01}; - constexpr CUFINUFFT_FLT c9[] = {-7.1572272057931258E+01, 2.2785637019446185E+02, -3.9109820765219445E+02, 3.3597424707607246E+02, 1.7793576396134983E-08, -3.3597424727519928E+02, 3.9109820766111056E+02, -2.2785637019102543E+02, 7.1572272057951565E+01}; - constexpr CUFINUFFT_FLT c10[] = {-9.8886360698029030E+00, 3.5359026948517517E+01, -8.5251867695464824E+01, 1.4285748015591199E+02, -1.6935269673908536E+02, 1.4285748008591776E+02, -8.5251867720434134E+01, 3.5359026945818123E+01, -9.8886360698009241E+00}; - constexpr CUFINUFFT_FLT c11[] = {5.4050464453063796E-01, -1.7215219066697895E+00, 2.8631741265441102E+00, -2.3817977385844018E+00, -1.0173343205540475E-08, 2.3817977172440110E+00, -2.8631741497139487E+00, 1.7215219081941548E+00, -5.4050464453541269E-01}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; + constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; + constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; + constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; + constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; + constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; + constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; + constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; + constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; + constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; + constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==10) { - constexpr CUFINUFFT_FLT c0[] = {2.2594586605749279E+04, 1.3595989066786604E+07, 4.4723032442444921E+08, 3.3781755837397542E+09, 8.6836783895849838E+09, 8.6836783895849819E+09, 3.3781755837397518E+09, 4.4723032442444921E+08, 1.3595989066786485E+07, 2.2594586605749315E+04}; - constexpr CUFINUFFT_FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130477E+08, 2.4434902657508340E+09, 2.0073077861288924E+09, -2.0073077861288958E+09, -2.4434902657508330E+09, -5.9387966085130465E+08, -3.0651490267742820E+07, -1.0729981697645631E+05}; - constexpr CUFINUFFT_FLT c2[] = {2.2340399734184594E+05, 3.0258214643190444E+07, 3.1512411458738214E+08, 4.3618276932319784E+08, -7.8178848450497377E+08, -7.8178848450497079E+08, 4.3618276932319820E+08, 3.1512411458738226E+08, 3.0258214643190306E+07, 2.2340399734184553E+05}; - constexpr CUFINUFFT_FLT c3[] = {2.6917433004353492E+05, 1.6875651476661235E+07, 7.4664745481963485E+07, -9.5882157211117983E+07, -2.0622994435532546E+08, 2.0622994435532695E+08, 9.5882157211117893E+07, -7.4664745481963441E+07, -1.6875651476661157E+07, -2.6917433004353417E+05}; - constexpr CUFINUFFT_FLT c4[] = {2.0818422772177903E+05, 5.6084730690362593E+06, 1.4435118192352918E+06, -4.0063869969543688E+07, 3.2803674392747905E+07, 3.2803674392747425E+07, -4.0063869969546065E+07, 1.4435118192351861E+06, 5.6084730690362072E+06, 2.0818422772177853E+05}; - constexpr CUFINUFFT_FLT c5[] = {1.0781139496011089E+05, 9.9202615851199115E+05, -3.3266265543962144E+06, -4.8557049011465441E+05, 1.0176155522771550E+07, -1.0176155522773480E+07, 4.8557049011624791E+05, 3.3266265543963145E+06, -9.9202615851196367E+05, -1.0781139496011069E+05}; - constexpr CUFINUFFT_FLT c6[] = {3.7380102688153507E+04, 1.2716675000354149E+04, -6.2163527451780590E+05, 1.4157962667182824E+06, -8.4419693137806712E+05, -8.4419693137792684E+05, 1.4157962667183836E+06, -6.2163527451768133E+05, 1.2716675000338953E+04, 3.7380102688153551E+04}; - constexpr CUFINUFFT_FLT c7[] = {8.1238936393894865E+03, -3.4872365530450799E+04, 2.3913680325180554E+04, 1.2428850301840073E+05, -3.2158255329732876E+05, 3.2158255329921009E+05, -1.2428850301906197E+05, -2.3913680325219862E+04, 3.4872365530457639E+04, -8.1238936393893855E+03}; - constexpr CUFINUFFT_FLT c8[] = {7.8515926628983277E+02, -6.6607899119362401E+03, 2.0167398338517272E+04, -2.8951401344174039E+04, 1.4622828141519254E+04, 1.4622828143473866E+04, -2.8951401346529910E+04, 2.0167398338405819E+04, -6.6607899119515532E+03, 7.8515926628964587E+02}; - constexpr CUFINUFFT_FLT c9[] = {-1.0147176570533524E+02, -3.5304284183527621E+01, 1.3576976854816689E+03, -4.3921059353471846E+03, 7.3232085265419046E+03, -7.3232085280635902E+03, 4.3921059363220147E+03, -1.3576976854281722E+03, 3.5304284184270628E+01, 1.0147176570551520E+02}; - constexpr CUFINUFFT_FLT c10[] = {-4.3161545259395531E+01, 1.5498490982051828E+02, -3.1771250772612478E+02, 3.7215448793727404E+02, -1.7181762882439287E+02, -1.7181763008770599E+02, 3.7215448759715150E+02, -3.1771250770992856E+02, 1.5498490982321766E+02, -4.3161545259481535E+01}; - constexpr CUFINUFFT_FLT c11[] = {-4.2916172038404330E+00, 1.7402146068709751E+01, -4.7947588102062113E+01, 9.2697697983158491E+01, -1.2821427595919303E+02, 1.2821427694451660E+02, -9.2697698629471930E+01, 4.7947588133767717E+01, -1.7402146075416606E+01, 4.2916172038784923E+00}; - constexpr CUFINUFFT_FLT c12[] = {3.5357495062947814E-01, -1.2828127005767840E+00, 2.4090120532215455E+00, -2.6448901913160028E+00, 1.1811546776400381E+00, 1.1811568523765217E+00, -2.6448918925210712E+00, 2.4090119216851607E+00, -1.2828127015358992E+00, 3.5357495059093369E-01}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; + constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; + constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; + constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; + constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; + constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; + constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; + constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; + constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; + constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; + constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; + constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==11) { - constexpr CUFINUFFT_FLT c0[] = {3.7794653219809574E+04, 3.4782300224660799E+07, 1.6188020733727567E+09, 1.7196758809615021E+10, 6.3754384857724678E+10, 9.7196447559193558E+10, 6.3754384857724640E+10, 1.7196758809615005E+10, 1.6188020733727570E+09, 3.4782300224660806E+07, 3.7794653219808897E+04}; - constexpr CUFINUFFT_FLT c1[] = {1.8969206922085880E+05, 8.4769319065313682E+07, 2.4230555767723408E+09, 1.5439732722639105E+10, 2.7112836839612309E+10, 2.9154817084916870E-06, -2.7112836839612320E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05}; - constexpr CUFINUFFT_FLT c2[] = {4.2138380313901423E+05, 9.2050522922791898E+07, 1.5259983101266611E+09, 4.7070559561237154E+09, -1.2448027572952452E+09, -1.0161446790279312E+10, -1.2448027572952352E+09, 4.7070559561237249E+09, 1.5259983101266615E+09, 9.2050522922791868E+07, 4.2138380313901143E+05}; - constexpr CUFINUFFT_FLT c3[] = {5.4814313598122017E+05, 5.8085130777589574E+07, 4.9484006166551107E+08, 1.6222124676640958E+08, -2.0440440381345322E+09, -1.0628188648962249E-06, 2.0440440381345263E+09, -1.6222124676641047E+08, -4.9484006166551083E+08, -5.8085130777589560E+07, -5.4814313598121691E+05}; - constexpr CUFINUFFT_FLT c4[] = {4.6495183529254969E+05, 2.3067199578027174E+07, 6.9832590192482829E+07, -2.2024799260683161E+08, -1.2820270942587741E+08, 5.1017181199130940E+08, -1.2820270942587276E+08, -2.2024799260684022E+08, 6.9832590192482591E+07, 2.3067199578027155E+07, 4.6495183529254753E+05}; - constexpr CUFINUFFT_FLT c5[] = {2.7021781043532968E+05, 5.6764510325100143E+06, -5.5650761736747762E+06, -3.9907385617900737E+07, 7.2453390663686648E+07, 3.7361048615190248E-06, -7.2453390663685605E+07, 3.9907385617898554E+07, 5.5650761736747930E+06, -5.6764510325100180E+06, -2.7021781043532834E+05}; - constexpr CUFINUFFT_FLT c6[] = {1.0933249308680615E+05, 6.9586821127986431E+05, -3.6860240321940281E+06, 2.7428169457723838E+06, 8.3392008440598147E+06, -1.6402201025051240E+07, 8.3392008440649221E+06, 2.7428169457788388E+06, -3.6860240321937916E+06, 6.9586821127989038E+05, 1.0933249308680584E+05}; - constexpr CUFINUFFT_FLT c7[] = {3.0203516161820480E+04, -3.6879059542777912E+04, -4.1141031216801296E+05, 1.4111389975270075E+06, -1.5914376635392811E+06, 6.6766157119460594E-07, 1.5914376635341521E+06, -1.4111389975270815E+06, 4.1141031216760987E+05, 3.6879059542751726E+04, -3.0203516161820367E+04}; - constexpr CUFINUFFT_FLT c8[] = {5.1670143574922804E+03, -2.8613147115365118E+04, 4.3560195427108687E+04, 4.8438679581840552E+04, -2.5856630639330545E+05, 3.7994883866097208E+05, -2.5856630640124826E+05, 4.8438679578319818E+04, 4.3560195426824532E+04, -2.8613147115371667E+04, 5.1670143574923577E+03}; - constexpr CUFINUFFT_FLT c9[] = {3.0888018539742444E+02, -3.7949446187516196E+03, 1.4313303205035631E+04, -2.6681600236925929E+04, 2.3856005161221132E+04, -2.3276789125970764E-06, -2.3856005160840708E+04, 2.6681600234072768E+04, -1.4313303205083184E+04, 3.7949446187479048E+03, -3.0888018539723868E+02}; - constexpr CUFINUFFT_FLT c10[] = {-8.3747489794255131E+01, 1.1948077479810485E+02, 4.8528498025870488E+02, -2.5024391115619069E+03, 5.3511195350414373E+03, -6.7655484152307990E+03, 5.3511195328171416E+03, -2.5024391120801879E+03, 4.8528498023710927E+02, 1.1948077481025226E+02, -8.3747489794331599E+01}; - constexpr CUFINUFFT_FLT c11[] = {-2.2640047135555928E+01, 9.0840898549317998E+01, -2.1597187568776889E+02, 3.1511229085836396E+02, -2.4856618287164540E+02, 1.6489710183426948E-06, 2.4856618404233313E+02, -3.1511228957061689E+02, 2.1597187534632059E+02, -9.0840898568829203E+01, 2.2640047135641577E+01}; - constexpr CUFINUFFT_FLT c12[] = {-1.6306382885945303E+00, 7.3325946569413265E+00, -2.3241017814397217E+01, 5.1715493697385526E+01, -8.2673003927086967E+01, 9.6489715222659115E+01, -8.2673013187251925E+01, 5.1715492855550593E+01, -2.3241018165160245E+01, 7.3325946421432624E+00, -1.6306382886373367E+00}; - constexpr CUFINUFFT_FLT c13[] = {2.4409286936442823E-01, -7.8803147249892458E-01, 1.6467143668339987E+00, -2.1898241453519685E+00, 1.6350102449767006E+00, -1.1782931558589478E-06, -1.6350139430218933E+00, 2.1898230913723329E+00, -1.6467144225690411E+00, 7.8803147709023735E-01, -2.4409286927983653E-01}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; + constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; + constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; + constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; + constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; + constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; + constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; + constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; + constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; + constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; + constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; + constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; + constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==12) { - constexpr CUFINUFFT_FLT c0[] = {6.1722991679853112E+04, 8.4789650417103723E+07, 5.4431675199498730E+09, 7.8788892335272293E+10, 4.0355760945670062E+11, 8.8071481911347974E+11, 8.8071481911347998E+11, 4.0355760945670068E+11, 7.8788892335272491E+10, 5.4431675199498854E+09, 8.4789650417103767E+07, 6.1722991679871629E+04}; - constexpr CUFINUFFT_FLT c1[] = {3.2561466099406150E+05, 2.2112758120210624E+08, 8.9911609880089817E+09, 8.3059508064200928E+10, 2.3965569143469864E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210624E+08, -3.2561466099404282E+05}; - constexpr CUFINUFFT_FLT c2[] = {7.6621098001581465E+05, 2.6026568260310274E+08, 6.4524338253008652E+09, 3.3729904113826797E+10, 2.8555202212474010E+10, -6.8998572040731583E+10, -6.8998572040731506E+10, 2.8555202212474064E+10, 3.3729904113826805E+10, 6.4524338253008747E+09, 2.6026568260310277E+08, 7.6621098001583852E+05}; - constexpr CUFINUFFT_FLT c3[] = {1.0657807616803222E+06, 1.8144472126890999E+08, 2.5524827004349856E+09, 5.2112383911371746E+09, -1.0268350564014614E+10, -1.4763245309081245E+10, 1.4763245309081299E+10, 1.0268350564014664E+10, -5.2112383911371031E+09, -2.5524827004349875E+09, -1.8144472126890990E+08, -1.0657807616803090E+06}; - constexpr CUFINUFFT_FLT c4[] = {9.7829638830158743E+05, 8.2222351241519973E+07, 5.5676911894064891E+08, -4.8739037675424922E+08, -2.7153428193077750E+09, 2.5627633609246840E+09, 2.5627633609247112E+09, -2.7153428193078070E+09, -4.8739037675429451E+08, 5.5676911894064677E+08, 8.2222351241519928E+07, 9.7829638830161165E+05}; - constexpr CUFINUFFT_FLT c5[] = {6.2536876825113979E+05, 2.4702814073680263E+07, 4.1488431554845832E+07, -2.9274790542418414E+08, 1.0742154109193267E+08, 6.2185168968029702E+08, -6.2185168968023658E+08, -1.0742154109185636E+08, 2.9274790542422676E+08, -4.1488431554844096E+07, -2.4702814073680244E+07, -6.2536876825112442E+05}; - constexpr CUFINUFFT_FLT c6[] = {2.8527714307528478E+05, 4.6266378435690189E+06, -1.0665598090791209E+07, -2.6048960239906937E+07, 9.1597254427339226E+07, -5.9794495983323507E+07, -5.9794495983287223E+07, 9.1597254427330941E+07, -2.6048960239925586E+07, -1.0665598090793334E+07, 4.6266378435690831E+06, 2.8527714307530422E+05}; - constexpr CUFINUFFT_FLT c7[] = {9.2873647411234240E+04, 3.6630046787428786E+05, -3.1271047224731087E+06, 4.8612412939261831E+06, 3.3820440907802135E+06, -1.6880127953711823E+07, 1.6880127953682471E+07, -3.3820440907974164E+06, -4.8612412939092657E+06, 3.1271047224737639E+06, -3.6630046787430649E+05, -9.2873647411216807E+04}; - constexpr CUFINUFFT_FLT c8[] = {2.0817947751046187E+04, -5.5660303410280452E+04, -1.9519783923293054E+05, 1.0804817251338358E+06, -1.8264985852948832E+06, 9.7602844964432076E+05, 9.7602844962242560E+05, -1.8264985853129351E+06, 1.0804817251129062E+06, -1.9519783923449527E+05, -5.5660303410338929E+04, 2.0817947751063308E+04}; - constexpr CUFINUFFT_FLT c9[] = {2.7986023314784748E+03, -1.9404411093600604E+04, 4.3922624999853564E+04, -7.6450317375817094E+03, -1.5273911976404345E+05, 3.3223441450299282E+05, -3.3223441454103496E+05, 1.5273911977621692E+05, 7.6450317497551932E+03, -4.3922624998426982E+04, 1.9404411093646668E+04, -2.7986023314644040E+03}; - constexpr CUFINUFFT_FLT c10[] = {6.7849020474186844E+01, -1.7921351307934926E+03, 8.4980694693463538E+03, -1.9742624859078383E+04, 2.4620674878200782E+04, -1.1676544885779787E+04, -1.1676544871958942E+04, 2.4620674838120303E+04, -1.9742624835582923E+04, 8.4980694640771490E+03, -1.7921351307934922E+03, 6.7849020488748664E+01}; - constexpr CUFINUFFT_FLT c11[] = {-5.4577020998847871E+01, 1.3637112866755427E+02, 4.5513615487589092E+01, -1.1174001343792290E+03, 3.2018769324922364E+03, -5.0580351333780654E+03, 5.0580351424313239E+03, -3.2018769362383905E+03, 1.1174000937955741E+03, -4.5513610843875405E+01, -1.3637112870657899E+02, 5.4577021011919037E+01}; - constexpr CUFINUFFT_FLT c12[] = {-1.0538365872424132E+01, 4.6577222490846609E+01, -1.2606964180937365E+02, 2.1881091191930210E+02, -2.3273402308837001E+02, 1.0274273857329082E+02, 1.0274268020620094E+02, -2.3273404553726701E+02, 2.1881091276113446E+02, -1.2606964815819696E+02, 4.6577222438230805E+01, -1.0538365860846021E+01}; - constexpr CUFINUFFT_FLT c13[] = {-4.6087004128022252E-01, 2.5969759424153827E+00, -9.6946930749915676E+00, 2.4990050007153755E+01, -4.6013920149683365E+01, 6.2056948047986317E+01, -6.2056981293939970E+01, 4.6013908245461884E+01, -2.4990038356462701E+01, 9.6946952377382889E+00, -2.5969759165384922E+00, 4.6087004737535314E-01}; + constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; + constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; + constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; + constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; + constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; + constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; + constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; + constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; + constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; + constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; + constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; + constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; + constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; + constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - constexpr CUFINUFFT_FLT c0[] = {9.8715725867495639E+04, 1.9828875496808118E+08, 1.7196758809614998E+10, 3.3083776881353607E+11, 2.2668873993375444E+12, 6.7734720591167598E+12, 9.6695220682534824E+12, 6.7734720591167471E+12, 2.2668873993375439E+12, 3.3083776881353534E+11, 1.7196758809614998E+10, 1.9828875496807906E+08, 9.8715725867495537E+04}; - constexpr CUFINUFFT_FLT c1[] = {5.4491110456935503E+05, 5.4903670125539362E+08, 3.0879465445278172E+10, 3.9588436413399951E+11, 1.6860562536749778E+12, 2.4256447893117881E+12, 3.7318165868693593E-04, -2.4256447893117856E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538874E+08, -5.4491110456935491E+05}; - constexpr CUFINUFFT_FLT c2[] = {1.3504711883426066E+06, 6.9286979077463162E+08, 2.4618123595484562E+10, 1.9493985627722598E+11, 3.9422703517046326E+11, -1.8678883613919931E+11, -8.5538079834550146E+11, -1.8678883613919705E+11, 3.9422703517046338E+11, 1.9493985627722586E+11, 2.4618123595484554E+10, 6.9286979077462578E+08, 1.3504711883426069E+06}; - constexpr CUFINUFFT_FLT c3[] = {1.9937206140846494E+06, 5.2512029493765986E+08, 1.1253303793811754E+10, 4.6205527735932175E+10, -1.1607472377983284E+10, -1.6305241755642325E+11, 1.3350300616010507E-04, 1.6305241755642365E+11, 1.1607472377982744E+10, -4.6205527735932228E+10, -1.1253303793811750E+10, -5.2512029493765610E+08, -1.9937206140846484E+06}; - constexpr CUFINUFFT_FLT c4[] = {1.9607419630386413E+06, 2.6425362558103889E+08, 3.1171259341747255E+09, 2.9839860297840505E+09, -1.9585031917561890E+10, -5.0666917387055302E+09, 3.6568794485482079E+10, -5.0666917387051382E+09, -1.9585031917561581E+10, 2.9839860297839398E+09, 3.1171259341747217E+09, 2.6425362558103737E+08, 1.9607419630386410E+06}; - constexpr CUFINUFFT_FLT c5[] = {1.3593773865640301E+06, 9.1556445104158148E+07, 4.7074012944133490E+08, -1.1192579335657711E+09, -2.1090780087868552E+09, 5.2270306737949314E+09, 1.0058570913473114E-03, -5.2270306737942495E+09, 2.1090780087878082E+09, 1.1192579335658059E+09, -4.7074012944133729E+08, -9.1556445104157895E+07, -1.3593773865640303E+06}; - constexpr CUFINUFFT_FLT c6[] = {6.8417206432039186E+05, 2.1561705510027003E+07, 7.5785249892988410E+06, -2.7456096030230397E+08, 3.4589095671043062E+08, 4.0256106808852541E+08, -1.0074306926606210E+09, 4.0256106809059316E+08, 3.4589095670995283E+08, -2.7456096030234104E+08, 7.5785249893005500E+06, 2.1561705510027427E+07, 6.8417206432039267E+05}; - constexpr CUFINUFFT_FLT c7[] = {2.5248269397037479E+05, 3.0985559672615193E+06, -1.1816517087617906E+07, -8.2958498770340970E+06, 8.0546642347242445E+07, -1.0594657799535300E+08, -4.1868673222825360E-04, 1.0594657799426495E+08, -8.0546642347729877E+07, 8.2958498770339396E+06, 1.1816517087613177E+07, -3.0985559672620757E+06, -2.5248269397037491E+05}; - constexpr CUFINUFFT_FLT c8[] = {6.7530100970876083E+04, 1.2373362326659705E+05, -2.1245597183259744E+06, 5.1047323238916462E+06, -1.4139444405955642E+06, -1.1818267554953648E+07, 2.0121548577168033E+07, -1.1818267556967378E+07, -1.4139444400679788E+06, 5.1047323236808330E+06, -2.1245597183310925E+06, 1.2373362326704434E+05, 6.7530100970875879E+04}; - constexpr CUFINUFFT_FLT c9[] = {1.2421368748960791E+04, -5.0576243646949319E+04, -4.8878193435000605E+04, 6.5307896868984913E+05, -1.5497610128277773E+06, 1.5137725915373438E+06, 2.4159142842753925E-04, -1.5137725925842635E+06, 1.5497610128277773E+06, -6.5307896858028776E+05, 4.8878193437283131E+04, 5.0576243646456518E+04, -1.2421368748960884E+04}; - constexpr CUFINUFFT_FLT c10[] = {1.2904654687546160E+03, -1.1169946055063081E+04, 3.3275109714208906E+04, -3.1765222279764806E+04, -5.9810981980285695E+04, 2.2355863005975721E+05, -3.1083591689740209E+05, 2.2355863472015061E+05, -5.9810982676856896E+04, -3.1765222445615127E+04, 3.3275109711790254E+04, -1.1169946054458416E+04, 1.2904654687550794E+03}; - constexpr CUFINUFFT_FLT c11[] = {-1.9043622268985253E+01, -6.8296542226098870E+02, 4.2702512255472038E+03, -1.2165497337805051E+04, 1.9423733200245264E+04, -1.6010024156865491E+04, -1.8587318864580292E-04, 1.6010021504569266E+04, -1.9423732997327170E+04, 1.2165497443946821E+04, -4.2702512314786209E+03, 6.8296542157807858E+02, 1.9043622268681840E+01}; - constexpr CUFINUFFT_FLT c12[] = {-3.0093984465812213E+01, 9.8972865698526618E+01, -9.7437039087669007E+01, -3.5079927282955276E+02, 1.5699250476860170E+03, -3.1287441993042225E+03, 3.8692185175061472E+03, -3.1287462825609659E+03, 1.5699252631952513E+03, -3.5079945803284346E+02, -9.7437044419281492E+01, 9.8972866145746991E+01, -3.0093984466256714E+01}; - constexpr CUFINUFFT_FLT c13[] = {-4.3050286009571908E+00, 2.1108975820085092E+01, -6.4297196365104938E+01, 1.2922885252832501E+02, -1.6991814421468084E+02, 1.2655005406584399E+02, -2.7552199668252238E-05, -1.2655093214380580E+02, 1.6991796275475141E+02, -1.2922893349406868E+02, 6.4297198822227926E+01, -2.1108976183295965E+01, 4.3050286010617569E+00}; - constexpr CUFINUFFT_FLT c14[] = {-1.0957333744888972E-01, 7.2949316377828033E-01, -3.4300810538238449E+00, 1.0470062030552395E+01, -2.2292087310650142E+01, 3.4570674930666925E+01, -3.9923385381532697E+01, 3.4573472104415345E+01, -2.2292369892227434E+01, 1.0470053799441445E+01, -3.4300825281782954E+00, 7.2949352704193948E-01, -1.0957333730383595E-01}; + constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; + constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; + constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; + constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; + constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; + constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; + constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; + constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; + constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; + constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; + constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; + constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; + constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; + constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; + constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - constexpr CUFINUFFT_FLT c0[] = {1.5499533202966300E+05, 4.4723032442444748E+08, 5.1495083701694786E+10, 1.2904576022918081E+12, 1.1534950432785512E+13, 4.5650102198520516E+13, 8.8830582190032688E+13, 8.8830582190032672E+13, 4.5650102198520516E+13, 1.1534950432785535E+13, 1.2904576022918081E+12, 5.1495083701695145E+10, 4.4723032442444843E+08, 1.5499533202970150E+05}; - constexpr CUFINUFFT_FLT c1[] = {8.9188339002980455E+05, 1.3065352538728631E+09, 9.9400185225815582E+10, 1.7136059013402410E+12, 1.0144146621675832E+13, 2.3034036018490723E+13, 1.4630967270448867E+13, -1.4630967270448859E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402410E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979524E+05}; - constexpr CUFINUFFT_FLT c2[] = {2.3170473769379673E+06, 1.7532505043698246E+09, 8.6523535958354294E+10, 9.7455289065487329E+11, 3.2977972139362295E+12, 1.7874626001697771E+12, -6.1480918082634004E+12, -6.1480918082633994E+12, 1.7874626001697695E+12, 3.2977972139362256E+12, 9.7455289065487366E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr CUFINUFFT_FLT c3[] = {3.6089249230396431E+06, 1.4278058213962190E+09, 4.4296625537022438E+10, 2.9466624630419812E+11, 3.1903621584503357E+11, -9.8834691411254529E+11, -1.1072264714919219E+12, 1.1072264714919253E+12, 9.8834691411255261E+11, -3.1903621584503473E+11, -2.9466624630419775E+11, -4.4296625537022629E+10, -1.4278058213962216E+09, -3.6089249230396645E+06}; - constexpr CUFINUFFT_FLT c4[] = {3.7733555140851741E+06, 7.8376718099107432E+08, 1.4443117772349600E+10, 4.3197433307419121E+10, -7.6585042240582489E+10, -1.8569640140761731E+11, 2.0385335192658878E+11, 2.0385335192657968E+11, -1.8569640140762405E+11, -7.6585042240578430E+10, 4.3197433307418945E+10, 1.4443117772349699E+10, 7.8376718099107552E+08, 3.7733555140852556E+06}; - constexpr CUFINUFFT_FLT c5[] = {2.8079157920112349E+06, 3.0340753492383713E+08, 2.9498136661747351E+09, -6.2820200387927818E+08, -2.2372008390622681E+10, 1.5217518660587118E+10, 4.0682590266889229E+10, -4.0682590266876595E+10, -1.5217518660581694E+10, 2.2372008390624306E+10, 6.2820200387922049E+08, -2.9498136661747746E+09, -3.0340753492383796E+08, -2.8079157920112382E+06}; - constexpr CUFINUFFT_FLT c6[] = {1.5361613559533113E+06, 8.3513615594416350E+07, 3.0077547202707732E+08, -1.3749596754069650E+09, -6.6733027297582805E+08, 5.9590333632825184E+09, -4.3025685566887646E+09, -4.3025685566943264E+09, 5.9590333632825480E+09, -6.6733027297550666E+08, -1.3749596754065177E+09, 3.0077547202710402E+08, 8.3513615594416887E+07, 1.5361613559533583E+06}; - constexpr CUFINUFFT_FLT c7[] = {6.2759409419592936E+05, 1.5741723594963074E+07, -1.5632610223404476E+07, -1.9294824907080847E+08, 4.4643806532363749E+08, 1.5178998383416286E+07, -9.6771139892184162E+08, 9.6771139891756535E+08, -1.5178998386503356E+07, -4.4643806533349395E+08, 1.9294824907058707E+08, 1.5632610223392753E+07, -1.5741723594962660E+07, -6.2759409419590654E+05}; - constexpr CUFINUFFT_FLT c8[] = {1.9151404903933575E+05, 1.7156606891565928E+06, -9.7733523156610541E+06, 4.2982266236283993E+06, 5.1660907884816565E+07, -1.1279400211055294E+08, 6.4701089573887214E+07, 6.4701089567399226E+07, -1.1279400211297083E+08, 5.1660907891780980E+07, 4.2982266233826252E+06, -9.7733523156971950E+06, 1.7156606891561027E+06, 1.9151404903936631E+05}; - constexpr CUFINUFFT_FLT c9[] = {4.2715272622844830E+04, -2.2565910608684317E+03, -1.1769776156829668E+06, 4.0078399908543471E+06, -3.8951858064309461E+06, -5.0944610762301283E+06, 1.6765992441460442E+07, -1.6765992436785825E+07, 5.0944610781778852E+06, 3.8951858054570677E+06, -4.0078399907569592E+06, 1.1769776157156830E+06, 2.2565910609040961E+03, -4.2715272622820310E+04}; - constexpr CUFINUFFT_FLT c10[] = {6.4806786522791654E+03, -3.5474227032931303E+04, 1.8237100723206047E+04, 3.0934714627485734E+05, -1.0394703921956274E+06, 1.4743920336239333E+06, -7.3356882129423053E+05, -7.3356882916659222E+05, 1.4743920340662012E+06, -1.0394703928590287E+06, 3.0934714634119731E+05, 1.8237100680361433E+04, -3.5474227032996088E+04, 6.4806786523011797E+03}; - constexpr CUFINUFFT_FLT c11[] = {4.9913632908432180E+02, -5.5416668526903932E+03, 2.0614058707628108E+04, -3.2285139177838235E+04, -5.3099560012237780E+03, 1.1559000312360718E+05, -2.2569743818692098E+05, 2.2569743267254104E+05, -1.1559000606061178E+05, 5.3099530192621614E+03, 3.2285139062955688E+04, -2.0614058671415001E+04, 5.5416668535488525E+03, -4.9913632906175445E+02}; - constexpr CUFINUFFT_FLT c12[] = {-3.3076333188770995E+01, -1.8970588549665433E+02, 1.8160423465108606E+03, -6.3715702906684537E+03, 1.2525623712293716E+04, -1.4199809613604592E+04, 6.4441857815348694E+03, 6.4441852068443368E+03, -1.4199811050333730E+04, 1.2525626046977848E+04, -6.3715705510753096E+03, 1.8160422724294601E+03, -1.8970588700494130E+02, -3.3076333169380085E+01}; - constexpr CUFINUFFT_FLT c13[] = {-1.4394533627757088E+01, 5.7000699312246105E+01, -1.0101141802233408E+02, -3.2954042015367456E+01, 6.1417873351558330E+02, -1.6177281811377129E+03, 2.4593356854220169E+03, -2.4593356782637338E+03, 1.6177289006539679E+03, -6.1417987494681950E+02, 3.2954142200289709E+01, 1.0101142888658896E+02, -5.7000698890466253E+01, 1.4394533639134110E+01}; - constexpr CUFINUFFT_FLT c14[] = {-1.5925952286169334E+00, 8.5113929411519127E+00, -2.8993517494090959E+01, 6.6373419665690747E+01, -1.0329523947888029E+02, 1.0280172537525394E+02, -4.3894765605046906E+01, -4.3897466711581743E+01, 1.0280269421314661E+02, -1.0329529425338121E+02, 6.6373405476301841E+01, -2.8993535416845578E+01, 8.5113925602355138E+00, -1.5925952196632756E+00}; - constexpr CUFINUFFT_FLT c15[] = {1.5984868375087002E-02, 1.2876155307218357E-01, -9.8359379953002779E-01, 3.7711056267887488E+00, -9.4307026856950991E+00, 1.6842022255882348E+01, -2.2310401016395307E+01, 2.2307954998498516E+01, -1.6843279237301534E+01, 9.4308852877255891E+00, -3.7711056267887488E+00, 9.8361025494556609E-01, -1.2876093931172500E-01, -1.5984859319657936E-02}; + constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; + constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; + constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; + constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; + constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; + constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; + constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; + constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; + constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; + constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; + constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; + constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; + constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; + constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; + constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - constexpr CUFINUFFT_FLT c0[] = {2.3939707792241831E+05, 9.7700272582690263E+08, 1.4715933396485272E+11, 4.7242424833337188E+12, 5.3987426629953602E+13, 2.7580474290566097E+14, 7.0693378336533425E+14, 9.6196578554477812E+14, 7.0693378336533450E+14, 2.7580474290566138E+14, 5.3987426629953812E+13, 4.7242424833337275E+12, 1.4715933396485272E+11, 9.7700272582690227E+08, 2.3939707792241947E+05}; - constexpr CUFINUFFT_FLT c1[] = {1.4314487885226030E+06, 2.9961416925358467E+09, 3.0273361232748438E+11, 6.8507333793903594E+12, 5.4192702756911008E+13, 1.7551587948105312E+14, 2.1874615668430153E+14, 7.1650878467891699E-02, -2.1874615668430153E+14, -1.7551587948105331E+14, -5.4192702756911164E+13, -6.8507333793903701E+12, -3.0273361232748431E+11, -2.9961416925358462E+09, -1.4314487885226023E+06}; - constexpr CUFINUFFT_FLT c2[] = {3.8829497354762917E+06, 4.2473082696966438E+09, 2.8414312556015527E+11, 4.3688281331121411E+12, 2.1823119508000523E+13, 3.2228098609392012E+13, -2.1833085454691871E+13, -7.3750710225100922E+13, -2.1833085454691941E+13, 3.2228098609392000E+13, 2.1823119508000590E+13, 4.3688281331121475E+12, 2.8414312556015521E+11, 4.2473082696966453E+09, 3.8829497354762908E+06}; - constexpr CUFINUFFT_FLT c3[] = {6.3495763451755773E+06, 3.6841035003733954E+09, 1.5965774278321045E+11, 1.5630338683778196E+12, 3.8749058615819282E+12, -2.7319740087723496E+12, -1.3233342822865416E+13, 1.2094759019991106E-03, 1.3233342822865408E+13, 2.7319740087723706E+12, -3.8749058615819390E+12, -1.5630338683778196E+12, -1.5965774278321036E+11, -3.6841035003733935E+09, -6.3495763451755773E+06}; - constexpr CUFINUFFT_FLT c4[] = {7.0146619045520434E+06, 2.1782897863065772E+09, 5.8897780310148148E+10, 3.1953009601770453E+11, 4.0651527030852091E+08, -1.6379148273275527E+12, -1.1568753136999574E+11, 2.7451653250461855E+12, -1.1568753137002715E+11, -1.6379148273276675E+12, 4.0651527030276263E+08, 3.1953009601770386E+11, 5.8897780310148087E+10, 2.1782897863065767E+09, 7.0146619045520416E+06}; - constexpr CUFINUFFT_FLT c5[] = {5.5580012413990181E+06, 9.2345162185944223E+08, 1.4522950934020067E+10, 2.7025952371212223E+10, -1.2304576967641710E+11, -1.0116752717201025E+11, 3.8517418245457495E+11, 1.1720185410178396E-01, -3.8517418245448737E+11, 1.0116752717220248E+11, 1.2304576967643900E+11, -2.7025952371215157E+10, -1.4522950934020073E+10, -9.2345162185944128E+08, -5.5580012413990190E+06}; - constexpr CUFINUFFT_FLT c6[] = {3.2693972344231787E+06, 2.8610260147425157E+08, 2.2348528403750129E+09, -3.4574515574239435E+09, -1.7480626463586948E+10, 3.1608597465528339E+10, 1.9879262560041798E+10, -6.6148013553832657E+10, 1.9879262560029728E+10, 3.1608597465497307E+10, -1.7480626463581020E+10, -3.4574515574192748E+09, 2.2348528403750839E+09, 2.8610260147425318E+08, 3.2693972344231806E+06}; - constexpr CUFINUFFT_FLT c7[] = {1.4553539959296260E+06, 6.4136842048383795E+07, 1.3622336582061595E+08, -1.2131510424646864E+09, 6.4322366984170294E+08, 4.5078753872136936E+09, -7.1689413747181644E+09, -1.1786171556070136E-02, 7.1689413746620741E+09, -4.5078753875125484E+09, -6.4322366985783029E+08, 1.2131510424602287E+09, -1.3622336582069945E+08, -6.4136842048384361E+07, -1.4553539959296270E+06}; - constexpr CUFINUFFT_FLT c8[] = {4.9358776531681529E+05, 9.7772970960589685E+06, -2.3511574237970300E+07, -1.0142613816602133E+08, 3.9421144218642426E+08, -2.8449115593954617E+08, -5.7549243245203042E+08, 1.1608781631399941E+09, -5.7549243247572994E+08, -2.8449115597919518E+08, 3.9421144214433813E+08, -1.0142613816466759E+08, -2.3511574237996321E+07, 9.7772970960581861E+06, 4.9358776531681448E+05}; - constexpr CUFINUFFT_FLT c9[] = {1.2660319987326673E+05, 7.7519511328176421E+05, -6.5244610661542173E+06, 9.0878257489026226E+06, 2.3116605620370809E+07, -8.7079594480778053E+07, 9.5542733720576629E+07, 4.2723164545317951E-02, -9.5542733670714036E+07, 8.7079594586736053E+07, -2.3116605561938088E+07, -9.0878257517268714E+06, 6.5244610661359569E+06, -7.7519511328043276E+05, -1.2660319987326747E+05}; - constexpr CUFINUFFT_FLT c10[] = {2.3793325531458449E+04, -4.2305332803592217E+04, -5.2884156986641441E+05, 2.5307340140247596E+06, -4.0404175229102052E+06, -1.7519991511035681E+05, 1.0146438775036881E+07, -1.5828545434039038E+07, 1.0146438771144925E+07, -1.7520004460626876E+05, -4.0404175749208611E+06, 2.5307340154400147E+06, -5.2884156982771575E+05, -4.2305332803462676E+04, 2.3793325531458788E+04}; - constexpr CUFINUFFT_FLT c11[] = {2.9741655196842516E+03, -2.0687056404176896E+04, 3.3295507782231041E+04, 1.0661145714339131E+05, -5.6644238113375264E+05, 1.0874811579280477E+06, -9.6561272951275646E+05, -5.1287199081408294E-03, 9.6561272024221742E+05, -1.0874812519522079E+06, 5.6644242684715183E+05, -1.0661145918131116E+05, -3.3295507839673090E+04, 2.0687056403552484E+04, -2.9741655196846054E+03}; - constexpr CUFINUFFT_FLT c12[] = {1.5389176594851995E+02, -2.3864418514303975E+03, 1.0846266940782971E+04, -2.2940053288728755E+04, 1.4780109856545603E+04, 4.2663625334078126E+04, -1.3047651001642903E+05, 1.7468402233671257E+05, -1.3047651921148783E+05, 4.2663543727874072E+04, 1.4780033422571960E+04, -2.2940053360564565E+04, 1.0846266911599001E+04, -2.3864418523423406E+03, 1.5389176594715920E+02}; - constexpr CUFINUFFT_FLT c13[] = {-2.3857631312189291E+01, -1.9651605604649610E+01, 6.4183085202559698E+02, -2.8648428618202479E+03, 6.8249256924540387E+03, -9.7944454945500202E+03, 7.6177717113307281E+03, 1.2047808031005401E-02, -7.6177543637173221E+03, 9.7944303211006554E+03, -6.8249067869823548E+03, 2.8648410033462715E+03, -6.4183084900019139E+02, 1.9651606442715156E+01, 2.3857631312384541E+01}; - constexpr CUFINUFFT_FLT c14[] = {-6.1348505741956316E+00, 2.7872916029950378E+01, -6.5819949282243059E+01, 5.1366943137229264E+01, 1.7214074364107390E+02, -6.9658313160417026E+02, 1.3192072946885612E+03, -1.6053709652649356E+03, 1.3192033489278531E+03, -6.9663899461741221E+02, 1.7211498258980890E+02, 5.1367587332701412E+01, -6.5819942079787495E+01, 2.7872915852722411E+01, -6.1348505745937754E+00}; - constexpr CUFINUFFT_FLT c15[] = {-4.9671584494050897E-01, 3.0617548962871655E+00, -1.1650680501534040E+01, 3.0081518778147480E+01, -5.4027643304315461E+01, 6.6072752684824721E+01, -4.7155420133398515E+01, -5.6540863480770403E-03, 4.7158681490594240E+01, -6.6050534688928863E+01, 5.4059169757207428E+01, -3.0081909461561551E+01, 1.1650669885136919E+01, -3.0617550621683702E+00, 4.9671584460032286E-01}; - constexpr CUFINUFFT_FLT c16[] = {4.3460787769280373E-03, -1.3199805974685097E-02, -1.9413550415167488E-01, 1.1330353009743728E+00, -3.4412627904689330E+00, 7.1628360506506050E+00, -1.1104833360853762E+01, 1.2402582581952625E+01, -1.1114919494696498E+01, 7.0930736249049993E+00, -3.4864402649728556E+00, 1.1323392526753271E+00, -1.9415335680557039E-01, -1.3200242030886846E-02, 4.3460779753541788E-03}; + constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; + constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; + constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; + constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; + constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; + constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; + constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; + constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; + constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; + constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; + constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; + constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; + constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; + constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; + constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; + constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; + constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - constexpr CUFINUFFT_FLT c0[] = {3.6434551345571090E+05, 2.0744705928579485E+09, 4.0355760945670044E+11, 1.6364575388763037E+13, 2.3514830376056556E+14, 1.5192201717462535E+15, 4.9956173084674140E+15, 8.9287666945127430E+15, 8.9287666945127430E+15, 4.9956173084674140E+15, 1.5192201717462535E+15, 2.3514830376056556E+14, 1.6364575388763041E+13, 4.0355760945670050E+11, 2.0744705928579490E+09, 3.6434551345570857E+05}; - constexpr CUFINUFFT_FLT c1[] = {2.2576246485480363E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131055E+13, 2.6313738449330159E+14, 1.1495095100701462E+15, 2.1932582707747572E+15, 1.2860244365132600E+15, -1.2860244365132588E+15, -2.1932582707747572E+15, -1.1495095100701462E+15, -2.6313738449330169E+14, -2.5606844387131066E+13, -8.7873753526056323E+11, -6.6499571180086451E+09, -2.2576246485480368E+06}; - constexpr CUFINUFFT_FLT c2[] = {6.3730995546265068E+06, 9.9060026035198040E+09, 8.8097248605448987E+11, 1.7953384130753676E+13, 1.2398425545001648E+14, 3.0749346493041212E+14, 1.0259777520247089E+14, -5.5291976457534288E+14, -5.5291976457534375E+14, 1.0259777520247070E+14, 3.0749346493041225E+14, 1.2398425545001656E+14, 1.7953384130753684E+13, 8.8097248605449011E+11, 9.9060026035198078E+09, 6.3730995546265068E+06}; - constexpr CUFINUFFT_FLT c3[] = {1.0896915393078225E+07, 9.0890343524593887E+09, 5.3565169504010028E+11, 7.3004206720038701E+12, 2.9692333044160082E+13, 1.6051737468109645E+13, -9.1273329108089531E+13, -8.5999306918502797E+13, 8.5999306918501641E+13, 9.1273329108090062E+13, -1.6051737468109594E+13, -2.9692333044160074E+13, -7.3004206720038711E+12, -5.3565169504010034E+11, -9.0890343524593887E+09, -1.0896915393078221E+07}; - constexpr CUFINUFFT_FLT c4[] = {1.2655725616100587E+07, 5.7342804054544201E+09, 2.1822836608899588E+11, 1.8300700858999731E+12, 2.7770431049858564E+12, -8.5034969223847109E+12, -1.2846668467422201E+13, 1.6519076896574611E+13, 1.6519076896573730E+13, -1.2846668467421688E+13, -8.5034969223849521E+12, 2.7770431049858491E+12, 1.8300700858999692E+12, 2.1822836608899588E+11, 5.7342804054544220E+09, 1.2655725616100591E+07}; - constexpr CUFINUFFT_FLT c5[] = {1.0609303958036324E+07, 2.6255609052371726E+09, 6.1673589426039383E+10, 2.6044432099084976E+11, -3.5431628074578320E+11, -1.6077602129636006E+12, 1.5534405614729011E+12, 2.8019935380861670E+12, -2.8019935380844810E+12, -1.5534405614727644E+12, 1.6077602129636335E+12, 3.5431628074576636E+11, -2.6044432099085037E+11, -6.1673589426039368E+10, -2.6255609052371726E+09, -1.0609303958036324E+07}; - constexpr CUFINUFFT_FLT c6[] = {6.6544809363384563E+06, 8.9490403680928385E+08, 1.1882638725190760E+10, 8.1552898137784090E+09, -1.2575562817891687E+11, 2.7074695075842178E+10, 3.9453789461922034E+11, -3.1679644857435541E+11, -3.1679644857440692E+11, 3.9453789461951154E+11, 2.7074695076007500E+10, -1.2575562817885344E+11, 8.1552898137852116E+09, 1.1882638725191153E+10, 8.9490403680928493E+08, 6.6544809363384582E+06}; - constexpr CUFINUFFT_FLT c7[] = {3.1906872142824987E+06, 2.2785946180651781E+08, 1.3744578972809656E+09, -4.3997172592913818E+09, -9.2011130754125404E+09, 3.4690551711826530E+10, -9.4227043395316906E+09, -5.9308465069991577E+10, 5.9308465068943581E+10, 9.4227043392705956E+09, -3.4690551712022408E+10, 9.2011130753675175E+09, 4.3997172592866106E+09, -1.3744578972812984E+09, -2.2785946180652174E+08, -3.1906872142824973E+06}; - constexpr CUFINUFFT_FLT c8[] = {1.1821527096621725E+06, 4.2281234059839047E+07, 2.8723226058821958E+07, -8.3553955857311106E+08, 1.2447304829054153E+09, 2.1955280944846683E+09, -7.0514195725593920E+09, 4.3745141235010500E+09, 4.3745141236655197E+09, -7.0514195727234411E+09, 2.1955280942826533E+09, 1.2447304829048812E+09, -8.3553955857841730E+08, 2.8723226058853466E+07, 4.2281234059838966E+07, 1.1821527096621748E+06}; - constexpr CUFINUFFT_FLT c9[] = {3.3854610744280228E+05, 5.2176984975098642E+06, -2.0677283564981934E+07, -3.5831818966960624E+07, 2.6599346104854527E+08, -3.7992777983589816E+08, -1.3426914439904341E+08, 9.1752051209279442E+08, -9.1752051188087845E+08, 1.3426914452369988E+08, 3.7992777987329507E+08, -2.6599346107659298E+08, 3.5831818968129277E+07, 2.0677283565073237E+07, -5.2176984975084374E+06, -3.3854610744280077E+05}; - constexpr CUFINUFFT_FLT c10[] = {7.3893334077309293E+04, 2.6983804209740972E+05, -3.6415998560880083E+06, 8.4025485863333493E+06, 4.9278860779347531E+06, -5.1437033824108891E+07, 8.7603898602732122E+07, -4.6199497846299231E+07, -4.6199498219926819E+07, 8.7603898832003579E+07, -5.1437033801464774E+07, 4.9278861005788362E+06, 8.4025485870409794E+06, -3.6415998559663831E+06, 2.6983804209585470E+05, 7.3893334077307591E+04}; - constexpr CUFINUFFT_FLT c11[] = {1.1778892113374410E+04, -4.0077190109195144E+04, -1.8372552183899941E+05, 1.3262878359201169E+06, -2.9738540144900386E+06, 1.9493508843214174E+06, 4.1881949043266159E+06, -1.1066749441324197E+07, 1.1066749225224417E+07, -4.1881949989500660E+06, -1.9493509811827433E+06, 2.9738539876374160E+06, -1.3262878392766861E+06, 1.8372552166916840E+05, 4.0077190106541901E+04, -1.1778892113374635E+04}; - constexpr CUFINUFFT_FLT c12[] = {1.2019749667905517E+03, -1.0378455845905968E+04, 2.6333352626226591E+04, 1.7117060824677988E+04, -2.5133287788479996E+05, 6.4713912423136400E+05, -8.1634971996757365E+05, 3.8623850687193515E+05, 3.8623887467457692E+05, -8.1634999581952032E+05, 6.4713888515965885E+05, -2.5133289397614688E+05, 1.7117056658162492E+04, 2.6333352590306949E+04, -1.0378455846607170E+04, 1.2019749667886601E+03}; - constexpr CUFINUFFT_FLT c13[] = {3.1189837633271310E+01, -8.9083493666530228E+02, 4.9454294721013366E+03, -1.3124691362129612E+04, 1.5834782149156119E+04, 6.9607783053915546E+03, -5.9789949050326162E+04, 1.0841720290002371E+05, -1.0841726183381994E+05, 5.9790023686287932E+04, -6.9607416211385053E+03, -1.5834800728954084E+04, 1.3124692508510609E+04, -4.9454294244132070E+03, 8.9083493795553227E+02, -3.1189837630675466E+01}; - constexpr CUFINUFFT_FLT c14[] = {-1.2975319073318561E+01, 1.8283698900397550E+01, 1.7684013462935113E+02, -1.1059907069976271E+03, 3.1998196269059799E+03, -5.5988285845467362E+03, 5.9248624962359208E+03, -2.5987075415506133E+03, -2.5989297031998472E+03, 5.9249309327755627E+03, -5.5988287659129119E+03, 3.1998292347735460E+03, -1.1059914993060199E+03, 1.7684017599586443E+02, 1.8283697951655380E+01, -1.2975319075406015E+01}; - constexpr CUFINUFFT_FLT c15[] = {-2.3155118737567935E+00, 1.1938503501764195E+01, -3.4150613932459848E+01, 4.8896713096147266E+01, 1.5844216816345641E+01, -2.4277080939345015E+02, 6.0146058115394737E+02, -8.8748160721868635E+02, 8.8732832343048744E+02, -6.0146927810646923E+02, 2.4275722040513463E+02, -1.5849652411671842E+01, -4.8897528435446198E+01, 3.4150596946224454E+01, -1.1938504032584051E+01, 2.3155118728820292E+00}; - constexpr CUFINUFFT_FLT c16[] = {-1.5401723736175238E-01, 9.8067757197686212E-01, -4.1901188293318530E+00, 1.2150691895619683E+01, -2.4764820628534302E+01, 3.6081462800085532E+01, -3.4534922277532473E+01, 1.2910251318703700E+01, 1.3098525817101535E+01, -3.4588714991360455E+01, 3.5973877372429698E+01, -2.4775747273530602E+01, 1.2149010873312557E+01, -4.1901467369287460E+00, 9.8067700766883559E-01, -1.5401723876450651E-01}; - constexpr CUFINUFFT_FLT c17[] = {1.1808835457017667E-02, -2.5443945538745794E-02, -1.3157119144786456E-04, 2.5877310634925382E-01, -1.0920774586473376E+00, 2.6473618304294715E+00, -4.4448325935254926E+00, 6.8292491990998831E+00, -6.8300632710034588E+00, 4.4643703192113184E+00, -2.6384070394901351E+00, 1.0890246890089277E+00, -2.5849326913239973E-01, 1.4031610447463365E-04, 2.5444280926035151E-02, -1.1808834729180664E-02}; + constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; + constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; + constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; + constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; + constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; + constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; + constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; + constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; + constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; + constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; + constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; + constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; + constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; + constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; + constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; + constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; + constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; + constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc new file mode 100644 index 000000000..358a1bdbf --- /dev/null +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -0,0 +1,192 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. + if (w==2) { + constexpr FLT c0[] = {2.3711015472112535E+01, 2.3711015472112539E+01}; + constexpr FLT c1[] = {2.5079742199350566E+01, -2.5079742199350566E+01}; + constexpr FLT c2[] = {-3.5023281580177019E+00, -3.5023281580177028E+00}; + constexpr FLT c3[] = {-7.3894949249195596E+00, 7.3894949249195649E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {5.9620016143346866E+01, 2.4110216701187517E+02, 5.9620016148621886E+01}; + constexpr FLT c1[] = {9.7575520958604287E+01, 6.0625609804989280E-15, -9.7575520952908548E+01}; + constexpr FLT c2[] = {3.5838417859768519E+01, -7.3472145274965385E+01, 3.5838417865129472E+01}; + constexpr FLT c3[] = {-1.0721643298166459E+01, 2.2269719700859066E-14, 1.0721643303220411E+01}; + constexpr FLT c4[] = {-7.0570630207138105E+00, 9.1538553399011651E+00, -7.0570630151506615E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==4) { + constexpr FLT c0[] = {1.2612470018753703E+02, 1.1896204292999123E+03, 1.1896204292999125E+03, 1.2612470018753706E+02}; + constexpr FLT c1[] = {2.6158034850676631E+02, 5.6161104654809833E+02, -5.6161104654809833E+02, -2.6158034850676631E+02}; + constexpr FLT c2[] = {1.7145379463699527E+02, -1.6695967127766502E+02, -1.6695967127766531E+02, 1.7145379463699518E+02}; + constexpr FLT c3[] = {2.3525961965887934E+01, -1.0057439659768855E+02, 1.0057439659768869E+02, -2.3525961965887870E+01}; + constexpr FLT c4[] = {-1.5608307370340814E+01, 9.5627412100261218E+00, 9.5627412100261768E+00, -1.5608307370340912E+01}; + constexpr FLT c5[] = {-4.5715207776748672E+00, 7.9904373067896399E+00, -7.9904373067894170E+00, 4.5715207776748832E+00}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==5) { + constexpr FLT c0[] = {2.4106943677442635E+02, 4.3538384278025578E+03, 9.3397486707382068E+03, 4.3538384278025542E+03, 2.4106943677442635E+02}; + constexpr FLT c1[] = {5.8781364250328284E+02, 3.4742855804122032E+03, -2.2247045611533172E-13, -3.4742855804122019E+03, -5.8781364250328272E+02}; + constexpr FLT c2[] = {5.1234107167555874E+02, 3.5219546517037230E+02, -1.7076861141633149E+03, 3.5219546517037259E+02, 5.1234107167555862E+02}; + constexpr FLT c3[] = {1.7540956907856085E+02, -3.5792356187777011E+02, 1.0950032210404113E-12, 3.5792356187777193E+02, -1.7540956907856062E+02}; + constexpr FLT c4[] = {-2.1768066955080412E-01, -7.8322173187697160E+01, 1.3904039464934533E+02, -7.8322173187696521E+01, -2.1768066955089899E-01}; + constexpr FLT c5[] = {-1.4207955403641282E+01, 1.6019466986222039E+01, 6.2864597222035853E-14, -1.6019466986221275E+01, 1.4207955403641282E+01}; + constexpr FLT c6[] = {-2.1966493586752702E+00, 5.0672636163198259E+00, -6.7340544905090631E+00, 5.0672636163192113E+00, -2.1966493586753031E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==6) { + constexpr FLT c0[] = {4.3011762559089192E+02, 1.3368828836127082E+04, 4.9861340433371268E+04, 4.9861340433371290E+04, 1.3368828836127082E+04, 4.3011762559835182E+02}; + constexpr FLT c1[] = {1.1857225840065146E+03, 1.4112553227730619E+04, 1.5410005180819442E+04, -1.5410005180819426E+04, -1.4112553227730617E+04, -1.1857225839984601E+03}; + constexpr FLT c2[] = {1.2460481448413077E+03, 4.3127030215084988E+03, -5.5438591621431215E+03, -5.5438591621431233E+03, 4.3127030215084969E+03, 1.2460481448488895E+03}; + constexpr FLT c3[] = {6.0825549344387821E+02, -3.4106010789546866E+02, -1.9775725023673151E+03, 1.9775725023673224E+03, 3.4106010789547190E+02, -6.0825549343673049E+02}; + constexpr FLT c4[] = {1.1264961069783713E+02, -3.9740822717990801E+02, 2.7557540616463564E+02, 2.7557540616463149E+02, -3.9740822717990505E+02, 1.1264961070570472E+02}; + constexpr FLT c5[] = {-1.5387906304333869E+01, -3.2640579296386335E+01, 1.1683718215647407E+02, -1.1683718215647050E+02, 3.2640579296386335E+01, 1.5387906311562686E+01}; + constexpr FLT c6[] = {-9.3947198873910107E+00, 1.5069930500884340E+01, -8.0900452409585597E+00, -8.0900452409573536E+00, 1.5069930500885983E+01, -9.3947198802582648E+00}; + constexpr FLT c7[] = {-5.6048841964528473E-01, 2.3377422080932533E+00, -4.2391567591829169E+00, 4.2391567591861783E+00, -2.3377422080911803E+00, 5.6048842664328347E-01}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==7) { + constexpr FLT c0[] = {7.2950392616203362E+02, 3.6439117038309523E+04, 2.1220891582018451E+05, 3.6180058567561547E+05, 2.1220891582018466E+05, 3.6439117038309538E+04, 7.2950392617434579E+02}; + constexpr FLT c1[] = {2.2197790785452585E+03, 4.6392067080426263E+04, 1.1568051746995676E+05, -2.6471374827810822E-11, -1.1568051746995673E+05, -4.6392067080426248E+04, -2.2197790785319785E+03}; + constexpr FLT c2[] = {2.6796845075663950E+03, 2.0921129984587253E+04, 3.9399551345633640E+01, -4.7251335435527413E+04, 3.9399551345568185E+01, 2.0921129984587242E+04, 2.6796845075789138E+03}; + constexpr FLT c3[] = {1.6253748990844513E+03, 2.6138488347211651E+03, -1.0037546705421486E+04, 4.9207207296884551E-11, 1.0037546705421528E+04, -2.6138488347211514E+03, -1.6253748990726617E+03}; + constexpr FLT c4[] = {4.9106375852553407E+02, -8.6668269315415375E+02, -1.0513434716617946E+03, 2.8444456471590820E+03, -1.0513434716617835E+03, -8.6668269315414682E+02, 4.9106375853851517E+02}; + constexpr FLT c5[] = {4.0739167949763470E+01, -2.8515155742293291E+02, 3.9930326803802245E+02, 9.3897520950192402E-12, -3.9930326803800614E+02, 2.8515155742293899E+02, -4.0739167937836122E+01}; + constexpr FLT c6[] = {-1.7148987139838134E+01, 7.5799002551925454E-01, 6.3260304953181709E+01, -1.0529869309159973E+02, 6.3260304953170241E+01, 7.5799002552861849E-01, -1.7148987128070043E+01}; + constexpr FLT c7[] = {-4.5424411501048008E+00, 9.8749254058339080E+00, -9.6456179777422530E+00, 1.4220101775868667E-11, 9.6456179778363111E+00, -9.8749254058241132E+00, 4.5424411616515830E+00}; + constexpr FLT c8[] = {-5.0793946806705008E-02, 7.3273813711596381E-01, -2.0117140545159620E+00, 2.6999257940738310E+00, -2.0117140545257630E+00, 7.3273813712090197E-01, -5.0793935652734865E-02}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==8) { + constexpr FLT c0[] = {1.1895823653767156E+03, 9.0980236725237002E+04, 7.7438826909537544E+05, 2.0077596413122714E+06, 2.0077596413122721E+06, 7.7438826909537590E+05, 9.0980236725237002E+04, 1.1895823653767152E+03}; + constexpr FLT c1[] = {3.9313191526977803E+03, 1.3318570706800825E+05, 5.7275848637687659E+05, 4.6250273225257988E+05, -4.6250273225258006E+05, -5.7275848637687659E+05, -1.3318570706800825E+05, -3.9313191526977798E+03}; + constexpr FLT c2[] = {5.2976026193612415E+03, 7.5628970871188474E+04, 1.0073339198368331E+05, -1.8165150843791279E+05, -1.8165150843791300E+05, 1.0073339198368324E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; + constexpr FLT c3[] = {3.7552239608473869E+03, 1.8376340228970930E+04, -2.3878081117551392E+04, -4.6296734056047753E+04, 4.6296734056048466E+04, 2.3878081117551716E+04, -1.8376340228970901E+04, -3.7552239608473869E+03}; + constexpr FLT c4[] = {1.4742862505418659E+03, 1.2842168112180084E+02, -9.1969665138397813E+03, 7.5990739935236888E+03, 7.5990739935236415E+03, -9.1969665138397813E+03, 1.2842168112182003E+02, 1.4742862505418657E+03}; + constexpr FLT c5[] = {2.8158981009344376E+02, -8.8613607108855138E+02, 5.3457145342334591E+01, 2.1750989694613118E+03, -2.1750989694611812E+03, -5.3457145342138865E+01, 8.8613607108855138E+02, -2.8158981009344376E+02}; + constexpr FLT c6[] = {-1.4786862436220549E+00, -1.3935442261829297E+02, 3.2599325739090762E+02, -1.9541889343354751E+02, -1.9541889343356968E+02, 3.2599325739086612E+02, -1.3935442261828183E+02, -1.4786862436238759E+00}; + constexpr FLT c7[] = {-1.1542034522900533E+01, 1.2000512051415985E+01, 1.9687328710253290E+01, -6.3962883082497100E+01, 6.3962883082831397E+01, -1.9687328710065113E+01, -1.2000512051397745E+01, 1.1542034522901620E+01}; + constexpr FLT c8[] = {-1.7448292513541994E+00, 4.8577330433876664E+00, -6.8794163043749101E+00, 3.4611708986529197E+00, 3.4611708984979552E+00, -6.8794163042722616E+00, 4.8577330434089125E+00, -1.7448292513539221E+00}; + constexpr FLT c9[] = {1.5044951479000782E-01, 9.6230159355094672E-02, -7.0399250408500635E-01, 1.3251401130885254E+00, -1.3251401130188682E+00, 7.0399250409661596E-01, -9.6230159344936325E-02, -1.5044951478914617E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.8793738965777031E+03, 2.1220891582018440E+05, 2.5233246441351655E+06, 9.2877384983420707E+06, 1.4015330434461467E+07, 9.2877384983420800E+06, 2.5233246441351655E+06, 2.1220891582018536E+05, 1.8793738965777065E+03}; + constexpr FLT c1[] = {6.6675066501609354E+03, 3.4704155240987014E+05, 2.2890184838322564E+06, 3.8705035445351237E+06, 1.1717532248112299E-10, -3.8705035445351265E+06, -2.2890184838322559E+06, -3.4704155240987102E+05, -6.6675066501609354E+03}; + constexpr FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092282E+05, -2.1140963571671949E+05, -1.4236515118873832E+06, -2.1140963571672430E+05, 6.8167589492092212E+05, 2.3171563090202416E+05, 9.8412775404612275E+03}; + constexpr FLT c3[] = {7.8762358364031061E+03, 7.6500585979636191E+04, 1.2434778984075345E+04, -2.8572091469429957E+05, 1.1900185890455270E-09, 2.8572091469430370E+05, -1.2434778984074723E+04, -7.6500585979636191E+04, -7.8762358364031033E+03}; + constexpr FLT c4[] = {3.6941911906762075E+03, 9.9232929169976032E+03, -3.3472877669901907E+04, -1.4082384858050133E+04, 6.7911966136974472E+04, -1.4082384858045889E+04, -3.3472877669901856E+04, 9.9232929169977433E+03, 3.6941911906762098E+03}; + constexpr FLT c5[] = {9.8900189723050323E+02, -1.2736589324621348E+03, -5.0407308390125609E+03, 9.8914296140178049E+03, 6.1223023135982708E-10, -9.8914296140230235E+03, 5.0407308390128219E+03, 1.2736589324621673E+03, -9.8900189723050403E+02}; + constexpr FLT c6[] = {1.1165868717716108E+02, -5.9057035448559543E+02, 5.5860705835625356E+02, 9.1996097522935008E+02, -2.0290255886368843E+03, 9.1996097522906575E+02, 5.5860705835607132E+02, -5.9057035448565603E+02, 1.1165868717715755E+02}; + constexpr FLT c7[] = {-1.3142584300867490E+01, -4.2852762793261455E+01, 1.8188640945803897E+02, -2.1362000457586478E+02, 1.1194928851903786E-10, 2.1362000457739751E+02, -1.8188640945787162E+02, 4.2852762793424958E+01, 1.3142584300868396E+01}; + constexpr FLT c8[] = {-5.8088068374876212E+00, 1.0201832931297655E+01, -3.5220973552653217E-01, -2.6632420897260161E+01, 4.2737607183076172E+01, -2.6632420895005694E+01, -3.5220973526763744E-01, 1.0201832931314263E+01, -5.8088068374874551E+00}; + constexpr FLT c9[] = {-4.0642645973149144E-01, 1.8389772328590479E+00, -3.5549484956004700E+00, 3.2273562224626624E+00, 2.3066481718890602E-10, -3.2273562263634674E+00, 3.5549484956933464E+00, -1.8389772328126097E+00, 4.0642645973247782E-01}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.8923571298063644E+03, 4.6856831608341972E+05, 7.5304732752870098E+06, 3.7576537584215805E+07, 7.9591606307847947E+07, 7.9591606307847947E+07, 3.7576537584215775E+07, 7.5304732752870088E+06, 4.6856831608341815E+05, 2.8923571298063584E+03}; + constexpr FLT c1[] = {1.0919387804943195E+04, 8.3976685277206486E+05, 7.9494027659552386E+06, 2.1606786285174560E+07, 1.4625897641453253E+07, -1.4625897641453268E+07, -2.1606786285174556E+07, -7.9494027659552386E+06, -8.3976685277206241E+05, -1.0919387804943173E+04}; + constexpr FLT c2[] = {1.7418455635504146E+04, 6.3489952164419868E+05, 3.1358985409389907E+06, 2.2547438801903715E+06, -6.0429762783920690E+06, -6.0429762783920504E+06, 2.2547438801903636E+06, 3.1358985409389869E+06, 6.3489952164419682E+05, 1.7418455635504106E+04}; + constexpr FLT c3[] = {1.5396188098732166E+04, 2.5490607173283477E+05, 4.2818880748176732E+05, -9.5435463094349112E+05, -1.2004850139039194E+06, 1.2004850139039543E+06, 9.5435463094349764E+05, -4.2818880748176464E+05, -2.5490607173283392E+05, -1.5396188098732144E+04}; + constexpr FLT c4[] = {8.2616700456447434E+03, 5.2880641964112423E+04, -6.1165055141129313E+04, -2.1590299490710214E+05, 2.1595822052158226E+05, 2.1595822052158433E+05, -2.1590299490713206E+05, -6.1165055141130644E+04, 5.2880641964112234E+04, 8.2616700456447343E+03}; + constexpr FLT c5[] = {2.7267169079066489E+03, 2.4572549134030178E+03, -2.6065821571076271E+04, 1.3919259807562572E+04, 4.6802084705703302E+04, -4.6802084705714791E+04, -1.3919259807544826E+04, 2.6065821571078101E+04, -2.4572549134029523E+03, -2.7267169079066462E+03}; + constexpr FLT c6[] = {5.0402062537834655E+02, -1.3640153425625094E+03, -1.4063198459010243E+03, 7.0858129627832977E+03, -4.8375233777539070E+03, -4.8375233777688618E+03, 7.0858129627894568E+03, -1.4063198459013925E+03, -1.3640153425628407E+03, 5.0402062537833399E+02}; + constexpr FLT c7[] = {2.4199726682552246E+01, -2.8393731159230907E+02, 5.1652001352658374E+02, 7.4578914842690025E+01, -1.1556759026394043E+03, 1.1556759026669868E+03, -7.4578914836335017E+01, -5.1652001352477316E+02, 2.8393731159271266E+02, -2.4199726682540764E+01}; + constexpr FLT c8[] = {-1.0545675122358718E+01, -3.0306758891736707E+00, 7.2305523762002423E+01, -1.3808908570315674E+02, 7.6293213390392353E+01, 7.6293213419941608E+01, -1.3808908572000124E+02, 7.2305523762424571E+01, -3.0306758892308885E+00, -1.0545675122367939E+01}; + constexpr FLT c9[] = {-2.1836930570445361E+00, 5.4992367507340179E+00, -4.5624617242018264E+00, -6.6492709812433128E+00, 2.0339240340948546E+01, -2.0339240355994509E+01, 6.6492709998185751E+00, 4.5624617253163429E+00, -5.4992367508385041E+00, 2.1836930570532433E+00}; + constexpr FLT c10[] = {-9.1748741454156318E-02, 5.2562451749078731E-01, -1.4144257942386596E+00, 1.8629579002072614E+00, -9.0169873685258095E-01, -9.0169875903814667E-01, 1.8629579050577161E+00, -1.4144257935638165E+00, 5.2562451754351402E-01, -9.1748741461736935E-02}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==11) { + constexpr FLT c0[] = {4.3537972057094375E+03, 9.8872306817881158E+05, 2.0938056062983297E+07, 1.3701428307175839E+08, 3.8828289972017384E+08, 5.4292197128519225E+08, 3.8828289972017366E+08, 1.3701428307175839E+08, 2.0938056062983308E+07, 9.8872306817881158E+05, 4.3537972057093921E+03}; + constexpr FLT c1[] = {1.7371472778611500E+04, 1.9155790709433779E+06, 2.4914432724618737E+07, 9.7792160665338382E+07, 1.3126779387874995E+08, -1.1645321713027108E-08, -1.3126779387875001E+08, -9.7792160665338382E+07, -2.4914432724618725E+07, -1.9155790709433777E+06, -1.7371472778611380E+04}; + constexpr FLT c2[] = {2.9650558537745463E+04, 1.6014973065836846E+06, 1.1867448782239098E+07, 2.0812212822540630E+07, -1.1749875870571045E+07, -4.5121922350041404E+07, -1.1749875870570999E+07, 2.0812212822540656E+07, 1.1867448782239093E+07, 1.6014973065836844E+06, 2.9650558537745292E+04}; + constexpr FLT c3[] = {2.8505604980264405E+04, 7.4166660874053370E+05, 2.5711466441825363E+06, -1.2146931938153724E+06, -8.3931576510115806E+06, 5.8947555067017928E-08, 8.3931576510117110E+06, 1.2146931938154269E+06, -2.5711466441825293E+06, -7.4166660874053300E+05, -2.8505604980264299E+04}; + constexpr FLT c4[] = {1.7045632829988484E+04, 1.9785834209758099E+05, 8.6361403553703407E+04, -1.0584472412325807E+06, -1.3367486018954750E+05, 1.7818009619468113E+06, -1.3367486018952320E+05, -1.0584472412325810E+06, 8.6361403553705750E+04, 1.9785834209758116E+05, 1.7045632829988426E+04}; + constexpr FLT c5[] = {6.5462464716912891E+03, 2.5347576368078731E+04, -7.5810878908802741E+04, -8.0774039751698409E+04, 2.5492801112953416E+05, 3.1373949311406158E-08, -2.5492801112952997E+05, 8.0774039751677527E+04, 7.5810878908807950E+04, -2.5347576368078797E+04, -6.5462464716912691E+03}; + constexpr FLT c6[] = {1.5684149291082226E+03, -1.0302687059850266E+03, -1.3446845770824604E+04, 2.0814393480318489E+04, 1.4366994276506950E+04, -4.4581342385966971E+04, 1.4366994276487216E+04, 2.0814393480327166E+04, -1.3446845770825106E+04, -1.0302687059851414E+03, 1.5684149291082156E+03}; + constexpr FLT c7[] = {1.9398419323286674E+02, -8.7329293867233980E+02, 2.4796533428845552E+02, 3.2905701326708659E+03, -4.8989871768521243E+03, 2.5910474731743909E-08, 4.8989871768931434E+03, -3.2905701326280059E+03, -2.4796533428623073E+02, 8.7329293867272952E+02, -1.9398419323288715E+02}; + constexpr FLT c8[] = {-4.2288232505094108E+00, -9.9574929618070513E+01, 2.9563077145679659E+02, -1.9453049353627330E+02, -4.0107401575324394E+02, 7.9532514191794951E+02, -4.0107401576649818E+02, -1.9453049352309569E+02, 2.9563077145970482E+02, -9.9574929617658114E+01, -4.2288232504962613E+00}; + constexpr FLT c9[] = {-5.3741131162116726E+00, 5.5350606001924518E+00, 1.9153744596147146E+01, -6.3189447496716646E+01, 6.6921287671707859E+01, -1.3450045688823196E-08, -6.6921287609294978E+01, 6.3189447455108059E+01, -1.9153744593546609E+01, -5.5350606002853286E+00, 5.3741131162113103E+00}; + constexpr FLT c10[] = {-7.0359426507051681E-01, 2.2229112760631806E+00, -3.2054079730741187E+00, 8.3392535011476268E-02, 6.8879260445103929E+00, -1.0795498350223303E+01, 6.8879260559828390E+00, 8.3392524213879743E-02, -3.2054079670004838E+00, 2.2229112761686296E+00, -7.0359426507381639E-01}; + constexpr FLT c11[] = {5.2648094862911970E-02, 9.9912561370710071E-02, -4.3913938793989010E-01, 7.9792986880755179E-01, -6.9191820607752896E-01, -3.1086723020887482E-08, 6.9191819251103082E-01, -7.9792986253876474E-01, 4.3913938485313375E-01, -9.9912561580306161E-02, -5.2648094876606648E-02}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.4299692685485479E+03, 2.0077596413122746E+06, 5.4904521978991687E+07, 4.5946106674819386E+08, 1.6835469840840111E+09, 3.1308386544851584E+09, 3.1308386544851594E+09, 1.6835469840840116E+09, 4.5946106674819499E+08, 5.4904521978991836E+07, 2.0077596413122742E+06, 6.4299692685634491E+03}; + constexpr FLT c1[] = {2.6965848540274084E+04, 4.1625245902732192E+06, 7.2097002594596982E+07, 3.8505085985474664E+08, 7.9479013671674263E+08, 4.7870231281824070E+08, -4.7870231281824070E+08, -7.9479013671674287E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732182E+06, -2.6965848540258085E+04}; + constexpr FLT c2[] = {4.8869694409905118E+04, 3.7863371066322499E+06, 3.9530526716552719E+07, 1.1475134266581047E+08, 4.6311261797931008E+07, -2.0442837194260687E+08, -2.0442837194260764E+08, 4.6311261797930703E+07, 1.1475134266581020E+08, 3.9530526716552772E+07, 3.7863371066322499E+06, 4.8869694409920470E+04}; + constexpr FLT c3[] = {5.0530564260114013E+04, 1.9615784087727305E+06, 1.1044597342441026E+07, 7.9812418612436997E+06, -3.4042228324588403E+07, -3.3301805987927672E+07, 3.3301805987928241E+07, 3.4042228324588865E+07, -7.9812418612435153E+06, -1.1044597342440989E+07, -1.9615784087727298E+06, -5.0530564260099913E+04}; + constexpr FLT c4[] = {3.3081876469965486E+04, 6.2011956881368393E+05, 1.3086001239863783E+06, -3.1165484297367223E+06, -5.1982996003441429E+06, 6.3530947749620415E+06, 6.3530947749622557E+06, -5.1982996003440823E+06, -3.1165484297365877E+06, 1.3086001239863841E+06, 6.2011956881368428E+05, 3.3081876469981347E+04}; + constexpr FLT c5[] = {1.4308966168506786E+04, 1.1375573205951968E+05, -1.0318195403423737E+05, -6.6892418721464148E+05, 5.9223570255464804E+05, 1.1093685152670993E+06, -1.1093685152665814E+06, -5.9223570255454781E+05, 6.6892418721485860E+05, 1.0318195403423111E+05, -1.1375573205951942E+05, -1.4308966168492359E+04}; + constexpr FLT c6[] = {4.0848961919701046E+03, 7.5033277163530902E+03, -5.2578904182708357E+04, 6.3431596330007251E+03, 1.5984798504282974E+05, -1.2521363434086266E+05, -1.2521363434064612E+05, 1.5984798504277965E+05, 6.3431596327688303E+03, -5.2578904182719976E+04, 7.5033277163531166E+03, 4.0848961919843532E+03}; + constexpr FLT c7[] = {7.1658797373677851E+02, -1.5499947984091114E+03, -4.5490740453145772E+03, 1.4520122796449663E+04, -3.7896465827621914E+03, -2.3597107892496744E+04, 2.3597107892730306E+04, 3.7896465829102508E+03, -1.4520122796250829E+04, 4.5490740453377412E+03, 1.5499947984094479E+03, -7.1658797372277252E+02}; + constexpr FLT c8[] = {5.2022749592536726E+01, -4.0624258132612465E+02, 5.2256582979411519E+02, 9.3282469962228390E+02, -2.8710622268636553E+03, 1.7594166900407929E+03, 1.7594166904608542E+03, -2.8710622266536416E+03, 9.3282469976057041E+02, 5.2256582978430436E+02, -4.0624258132566132E+02, 5.2022749606076808E+01}; + constexpr FLT c9[] = {-7.0341875498933257E+00, -2.3043166228613529E+01, 1.2279331781902621E+02, -1.6714687552668008E+02, -4.4746498567249184E+01, 3.6060905998808425E+02, -3.6060905975626497E+02, 4.4746498638578188E+01, 1.6714687551479193E+02, -1.2279331779450688E+02, 2.3043166229077912E+01, 7.0341875614883520E+00}; + constexpr FLT c10[] = {-2.1556100132578342E+00, 4.1361104015055048E+00, 1.8107701824759481E+00, -2.1223400283067541E+01, 3.5820961921268712E+01, -1.8782945757357222E+01, -1.8782945295761856E+01, 3.5820961970532480E+01, -2.1223400227730256E+01, 1.8107701446846367E+00, 4.1361104022646886E+00, -2.1556100021360516E+00}; + constexpr FLT c11[] = {-1.1440899376747989E-01, 7.0567641591059616E-01, -1.4530217944402339E+00, 1.0571984630250064E+00, 1.4389000408734942E+00, -4.2241734506571262E+00, 4.2241732732256922E+00, -1.4389001658681779E+00, -1.0571984849752754E+00, 1.4530218273656557E+00, -7.0567641625357191E-01, 1.1440900438178589E-01}; + constexpr FLT c12[] = {-1.4486009664532199E-02, 2.9387825785133236E-03, -1.0265970208873806E-01, 2.6748270027876714E-01, -3.3606433030575705E-01, 1.5850134054436241E-01, 1.5850148084990595E-01, -3.3606430399846576E-01, 2.6748282743067825E-01, -1.0265974511212309E-01, 2.9387825100049524E-03, -1.4486000362352570E-02}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {9.3397060605267925E+03, 3.9447202186643188E+06, 1.3701428307175836E+08, 1.4375660883001420E+09, 6.6384519128895750E+09, 1.5848048271166540E+10, 2.1031560281976685E+10, 1.5848048271166515E+10, 6.6384519128895721E+09, 1.4375660883001390E+09, 1.3701428307175830E+08, 3.9447202186642904E+06, 9.3397060605267870E+03}; + constexpr FLT c1[] = {4.0984512931817779E+04, 8.6828943763566837E+06, 1.9558432133067667E+08, 1.3674961320373521E+09, 3.9251291128182445E+09, 4.5116631434426517E+09, -5.2784645410468957E-07, -4.5116631434426460E+09, -3.9251291128182430E+09, -1.3674961320373495E+09, -1.9558432133067659E+08, -8.6828943763566315E+06, -4.0984512931817771E+04}; + constexpr FLT c2[] = {7.8379538318778941E+04, 8.4928073133582622E+06, 1.1992091153966446E+08, 5.0561697705436689E+08, 6.1845897311594033E+08, -5.1306326495404607E+08, -1.4790096327029381E+09, -5.1306326495404249E+08, 6.1845897311593974E+08, 5.0561697705436635E+08, 1.1992091153966436E+08, 8.4928073133582175E+06, 7.8379538318778941E+04}; + constexpr FLT c3[] = {8.6417670227040027E+04, 4.8250267333349725E+06, 3.9836803808039062E+07, 7.5026052902191281E+07, -7.7565422849559024E+07, -2.5393835488011667E+08, 3.3249826368607219E-06, 2.5393835488012213E+08, 7.7565422849558040E+07, -7.5026052902191922E+07, -3.9836803808038987E+07, -4.8250267333349492E+06, -8.6417670227040042E+04}; + constexpr FLT c4[] = {6.1161604972829395E+04, 1.7331203720075563E+06, 7.0216196997559210E+06, -3.6027138646115125E+06, -3.1775875626363419E+07, 1.6544480876799976E+06, 4.9816566960117713E+07, 1.6544480876825110E+06, -3.1775875626362957E+07, -3.6027138646109658E+06, 7.0216196997559462E+06, 1.7331203720075507E+06, 6.1161604972829424E+04}; + constexpr FLT c5[] = {2.9177164557155927E+04, 3.9318079134661297E+05, 3.1307448297762702E+05, -2.7571366584958737E+06, -9.8421840747392213E+05, 6.8469173866723683E+06, 2.8271164666996988E-07, -6.8469173866687613E+06, 9.8421840747752984E+05, 2.7571366584952055E+06, -3.1307448297760193E+05, -3.9318079134661169E+05, -2.9177164557155942E+04}; + constexpr FLT c6[] = {9.5097815505886592E+03, 4.8799940773717601E+04, -1.2734023162442955E+05, -2.5472337176560360E+05, 6.3596049196317361E+05, 2.2361868201724227E+05, -1.0716559939672153E+06, 2.2361868202200226E+05, 6.3596049196156661E+05, -2.5472337176510989E+05, -1.2734023162441404E+05, 4.8799940773715760E+04, 9.5097815505886429E+03}; + constexpr FLT c7[] = {2.0601715730545525E+03, 1.9365931141588459E+02, -2.5304303117500138E+04, 2.9151392447016315E+04, 5.9055020355996137E+04, -1.1784846181768291E+05, 2.6154044742765007E-06, 1.1784846181457305E+05, -5.9055020356659290E+04, -2.9151392447180453E+04, 2.5304303117533978E+04, -1.9365931141453160E+02, -2.0601715730545707E+03}; + constexpr FLT c8[] = {2.5975061893406377E+02, -1.0025387650570891E+03, -6.8642481197673135E+02, 6.7515314203707721E+03, -7.0772939651788483E+03, -6.5444514138990871E+03, 1.6566898963252905E+04, -6.5444514157945678E+03, -7.0772939632859488E+03, 6.7515314204902643E+03, -6.8642481194565551E+02, -1.0025387650535661E+03, 2.5975061893407650E+02}; + constexpr FLT c9[] = {5.8705282128692158E+00, -1.4424362302794552E+02, 3.3390627212323119E+02, 4.8151337259952918E+01, -1.1431733956368030E+03, 1.4557114776348812E+03, -3.3159944254032091E-07, -1.4557114806782522E+03, 1.1431733967780669E+03, -4.8151337378834590E+01, -3.3390627213511937E+02, 1.4424362302320881E+02, -5.8705282128605081E+00}; + constexpr FLT c10[] = {-4.0954969508851224E+00, -1.2634947171672739E+00, 3.8134139827368251E+01, -8.4115524684139231E+01, 4.2766848660349709E+01, 1.0573434367831015E+02, -1.9636661091449494E+02, 1.0573435467021281E+02, 4.2766847947710779E+01, -8.4115525105243464E+01, 3.8134139870558698E+01, -1.2634947126121756E+00, -4.0954969508837991E+00}; + constexpr FLT c11[] = {-6.2702735485690120E-01, 1.8595467760284645E+00, -1.3027978720941771E+00, -4.9265267037365117E+00, 1.3906831814366365E+01, -1.3753763493382712E+01, 2.6871064791607931E-07, 1.3753755542502716E+01, -1.3906831747296087E+01, 4.9265273573671839E+00, 1.3027978458757612E+00, -1.8595467797630605E+00, 6.2702735484380401E-01}; + constexpr FLT c12[] = {-4.8290636698016143E-02, 1.7531876457248552E-01, -5.0041296501579524E-01, 6.3665129689096389E-01, -1.2477021972354120E-02, -1.2061605995627183E+00, 1.8595304429529254E+00, -1.2061634758265700E+00, -1.2475794298747987E-02, 6.3665098120347430E-01, -5.0041293542010268E-01, 1.7531876909405444E-01, -4.8290636687311379E-02}; + constexpr FLT c13[] = {2.2894665623763296E-02, -7.1358251863425162E-03, -1.4950753078549017E-02, 7.0611554068321924E-02, -1.2311301880976686E-01, 1.0342486048127918E-01, -6.8988570158793749E-07, -1.0342802294420825E-01, 1.2311280070887519E-01, -7.0611922113576600E-02, 1.4950741151156504E-02, 7.1358201810974436E-03, -2.2894665619603353E-02}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.3368785683552924E+04, 7.5304732752870303E+06, 3.2765764524435025E+08, 4.2418096936485295E+09, 2.4197690538177547E+10, 7.2227640697189728E+10, 1.2261475327356721E+11, 1.2261475327356729E+11, 7.2227640697189728E+10, 2.4197690538177608E+10, 4.2418096936485305E+09, 3.2765764524435204E+08, 7.5304732752870284E+06, 1.3368785683578022E+04}; + constexpr FLT c1[] = {6.1154444023081698E+04, 1.7488686085101545E+07, 5.0279014009863281E+08, 4.4777867842655859E+09, 1.6916819861812075E+10, 2.8971884004562843E+10, 1.6054555293734529E+10, -1.6054555293734520E+10, -2.8971884004562851E+10, -1.6916819861812094E+10, -4.4777867842655849E+09, -5.0279014009863436E+08, -1.7488686085101552E+07, -6.1154444023056109E+04}; + constexpr FLT c2[] = {1.2279790808348054E+05, 1.8230319600271538E+07, 3.3815815633684015E+08, 1.9369899011251259E+09, 3.9743454154781294E+09, 7.4954544638351953E+08, -7.0173920607394953E+09, -7.0173920607394981E+09, 7.4954544638350523E+08, 3.9743454154781094E+09, 1.9369899011251252E+09, 3.3815815633684099E+08, 1.8230319600271549E+07, 1.2279790808350702E+05}; + constexpr FLT c3[] = {1.4339321200624772E+05, 1.1200899688172197E+07, 1.2799140125169736E+08, 4.0176966726270700E+08, 7.9146174555817381E+07, -1.1719748245183482E+09, -9.6919138198233318E+08, 9.6919138198235631E+08, 1.1719748245183690E+09, -7.9146174555820629E+07, -4.0176966726270568E+08, -1.2799140125169775E+08, -1.1200899688172201E+07, -1.4339321200622563E+05}; + constexpr FLT c4[] = {1.0866548538632697E+05, 4.4565213401510660E+06, 2.8354150929531515E+07, 2.2805067924010411E+07, -1.2058223609888455E+08, -1.2775415620367479E+08, 1.9261201640091833E+08, 1.9261201640092278E+08, -1.2775415620368402E+08, -1.2058223609887798E+08, 2.2805067924010262E+07, 2.8354150929531977E+07, 4.4565213401510660E+06, 1.0866548538635395E+05}; + constexpr FLT c5[] = {5.6346565047794371E+04, 1.1743908345502394E+06, 3.0601086667308519E+06, -7.2274020134796854E+06, -1.6220595157138506E+07, 2.0773587344464455E+07, 2.8183198298702076E+07, -2.8183198298697799E+07, -2.0773587344463386E+07, 1.6220595157145990E+07, 7.2274020134800859E+06, -3.0601086667311694E+06, -1.1743908345502326E+06, -5.6346565047771030E+04}; + constexpr FLT c6[] = {2.0435142564639620E+04, 1.9450977300079435E+05, -1.1234667576916210E+05, -1.5205767549239143E+06, 1.0515640561116433E+06, 3.7458351782459249E+06, -3.3794074240140119E+06, -3.3794074240169711E+06, 3.7458351782412329E+06, 1.0515640561062016E+06, -1.5205767549244103E+06, -1.1234667576906871E+05, 1.9450977300078108E+05, 2.0435142564663318E+04}; + constexpr FLT c7[] = {5.1491366053560578E+03, 1.4735748500446980E+04, -8.1689482343558659E+04, -3.5176894225535718E+04, 3.7034248411029513E+05, -1.9109669530087037E+05, -5.2637978465954703E+05, 5.2637978466513811E+05, 1.9109669530731969E+05, -3.7034248412243859E+05, 3.5176894226134398E+04, 8.1689482343736949E+04, -1.4735748500440675E+04, -5.1491366053330503E+03}; + constexpr FLT c8[] = {8.5138795113642539E+02, -1.2978618911724870E+03, -8.7500873646799319E+03, 2.1319159614070901E+04, 7.6586611596445446E+03, -6.2424139814276627E+04, 4.2620771484048986E+04, 4.2620771487400976E+04, -6.2424139811762492E+04, 7.6586611726886877E+03, 2.1319159614126653E+04, -8.7500873648028410E+03, -1.2978618911666397E+03, 8.5138795115875746E+02}; + constexpr FLT c9[] = {7.2176142041601707E+01, -4.5543406154804239E+02, 2.8301959889246939E+02, 2.1994171513294418E+03, -4.5082500681007541E+03, 4.7658016701186381E+02, 7.1044827179414842E+03, -7.1044827207946446E+03, -4.7658016510975699E+02, 4.5082500692420190E+03, -2.1994171509014677E+03, -2.8301959872009093E+02, 4.5543406154544186E+02, -7.2176142022434362E+01}; + constexpr FLT c10[] = {-3.1135380162987940E+00, -3.8554406978579038E+01, 1.4396028115898400E+02, -1.1260050343554748E+02, -3.0073664795307559E+02, 7.2079162583931463E+02, -4.1195307853504261E+02, -4.1195308389061950E+02, 7.2079161951195317E+02, -3.0073665201295637E+02, -1.1260050330597517E+02, 1.4396028109959775E+02, -3.8554406977567140E+01, -3.1135379980017595E+00}; + constexpr FLT c11[] = {-1.6022934776926798E+00, 1.8678197421256739E+00, 8.3368944138930399E+00, -3.0791579027234270E+01, 3.4749714150762280E+01, 1.2322523792409507E+01, -7.3924012166427417E+01, 7.3924001493712765E+01, -1.2322523909478123E+01, -3.4749718994457659E+01, 3.0791578402870758E+01, -8.3368943163363198E+00, -1.8678197396867300E+00, 1.6022934951962213E+00}; + constexpr FLT c12[] = {-1.9362061844377096E-01, 6.3024467546449237E-01, -9.3262282246103156E-01, -4.8908745811188170E-01, 4.0479355563504544E+00, -6.2829791472071852E+00, 3.1767781035894589E+00, 3.1767769811448687E+00, -6.2829724125407163E+00, 4.0479411685726534E+00, -4.8908752826470542E-01, -9.3262301538118120E-01, 6.3024467436836862E-01, -1.9362060312354304E-01}; + constexpr FLT c13[] = {1.8785913715361053E-02, 3.1605272623671174E-02, -1.3655798799707175E-01, 2.5016548497515428E-01, -1.6654380378010236E-01, -2.1682631004979175E-01, 6.1785823408636587E-01, -6.1786412281044067E-01, 2.1682412904087514E-01, 1.6654140467029407E-01, -2.5016543044993139E-01, 1.3655803570664179E-01, -3.1605272197692873E-02, -1.8785905270673971E-02}; + constexpr FLT c14[] = {-1.2896545121493665E-02, -3.7106960851979211E-03, 5.8859140039070395E-04, 1.3987190631712249E-02, -3.5710919113872190E-02, 4.3405397573933885E-02, -2.0030939379906375E-02, -2.0032731865340953E-02, 4.3401439168598052E-02, -3.5712796955756618E-02, 1.3987489379284932E-02, 5.8862874383716927E-04, -3.7106965853333437E-03, -1.2896537371347905E-02}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {1.8887777774374495E+04, 1.4015330434461441E+07, 7.5498683300180113E+08, 1.1900937739619959E+10, 8.2530965279375427E+10, 3.0178246269069617E+11, 6.3775691457119177E+11, 8.1471473119305627E+11, 6.3775691457119177E+11, 3.0178246269069659E+11, 8.2530965279375626E+10, 1.1900937739619970E+10, 7.5498683300180113E+08, 1.4015330434461441E+07, 1.8887777774374499E+04}; + constexpr FLT c1[] = {8.9780907163796350E+04, 3.4167636285297170E+07, 1.2346880033823483E+09, 1.3719272724135921E+10, 6.5858241494816727E+10, 1.5266999939989542E+11, 1.5687794513790732E+11, 8.2054309331652521E-05, -1.5687794513790729E+11, -1.5266999939989551E+11, -6.5858241494816811E+10, -1.3719272724135935E+10, -1.2346880033823485E+09, -3.4167636285297155E+07, -8.9780907163796262E+04}; + constexpr FLT c2[] = {1.8850321233130724E+05, 3.7693640983013548E+07, 8.9846818051570022E+08, 6.7094088040439663E+09, 1.9743296615199219E+10, 1.8072727219391186E+10, -2.0634615374559433E+10, -4.9654335197177406E+10, -2.0634615374559402E+10, 1.8072727219391071E+10, 1.9743296615199223E+10, 6.7094088040439653E+09, 8.9846818051569998E+08, 3.7693640983013526E+07, 1.8850321233130703E+05}; + constexpr FLT c3[] = {2.3185006533495741E+05, 2.4789475362741619E+07, 3.7751696829092431E+08, 1.7167916788178215E+09, 1.9832401267745426E+09, -3.4881359830883756E+09, -7.8785602379628572E+09, 9.7140016072625200E-05, 7.8785602379629736E+09, 3.4881359830884337E+09, -1.9832401267745149E+09, -1.7167916788178086E+09, -3.7751696829092413E+08, -2.4789475362741601E+07, -2.3185006533495741E+05}; + constexpr FLT c4[] = {1.8672970114818294E+05, 1.0741068109706741E+07, 9.8017949708492860E+07, 2.0291084954252207E+08, -2.7857869294215119E+08, -9.4112677968749356E+08, 1.7886520649348873E+08, 1.4579673547892964E+09, 1.7886520649342585E+08, -9.4112677968752539E+08, -2.7857869294214994E+08, 2.0291084954251558E+08, 9.8017949708492786E+07, 1.0741068109706741E+07, 1.8672970114818294E+05}; + constexpr FLT c5[] = {1.0411891611891470E+05, 3.1771463075269503E+06, 1.4880104152842240E+07, -6.8136965447559115E+06, -8.7072998215433106E+07, 1.8024116531034056E+06, 1.9067730799617344E+08, 4.2457739417067258E-05, -1.9067730799613068E+08, -1.8024116529409259E+06, 8.7072998215441659E+07, 6.8136965447553769E+06, -1.4880104152842039E+07, -3.1771463075269512E+06, -1.0411891611891471E+05}; + constexpr FLT c6[] = {4.1300641422694804E+04, 6.3217168592498475E+05, 7.7343707634861500E+05, -5.4575962381464886E+06, -3.7387211063140454E+06, 1.8451583614096310E+07, 3.0480804947991944E+06, -2.7500445095909819E+07, 3.0480804948348333E+06, 1.8451583614054784E+07, -3.7387211062913244E+06, -5.4575962381459959E+06, 7.7343707634824759E+05, 6.3217168592497776E+05, 4.1300641422694753E+04}; + constexpr FLT c7[] = {1.1710443348523793E+04, 7.5405449195728594E+04, -1.6634736996463325E+05, -5.6069290801800112E+05, 1.1540571564075467E+06, 1.0209821661192341E+06, -2.9641921942296810E+06, 3.3808352628184138E-05, 2.9641921942798980E+06, -1.0209821662794619E+06, -1.1540571563939669E+06, 5.6069290802062431E+05, 1.6634736996474760E+05, -7.5405449195719484E+04, -1.1710443348523821E+04}; + constexpr FLT c8[] = {2.3142324239350878E+03, 2.1710560541685127E+03, -3.6929625713073510E+04, 2.6143898219454975E+04, 1.4046980089280056E+05, -2.1033190113776314E+05, -1.1132269821056565E+05, 3.7491447377567255E+05, -1.1132269820392072E+05, -2.1033190119832297E+05, 1.4046980086087715E+05, 2.6143898218932318E+04, -3.6929625712961781E+04, 2.1710560541720374E+03, 2.3142324239350669E+03}; + constexpr FLT c9[] = {2.8879718294280184E+02, -9.2801372612475961E+02, -1.9817144426574330E+03, 9.9004179204792053E+03, -5.7928269087620147E+03, -2.1083466263505023E+04, 3.3285501948595454E+04, -2.7485328636422507E-05, -3.3285501965333991E+04, 2.1083466366979632E+04, 5.7928269521300508E+03, -9.9004179216204702E+03, 1.9817144428595318E+03, 9.2801372612847467E+02, -2.8879718294283089E+02}; + constexpr FLT c10[] = {1.3121871131812668E+01, -1.5978845116799533E+02, 2.7429718922951372E+02, 4.4598059414156506E+02, -1.8917609553066516E+03, 1.5303002688244715E+03, 1.7542368497545090E+03, -3.9411530602516441E+03, 1.7542369316431223E+03, 1.5303002442924305E+03, -1.8917609584163495E+03, 4.4598059457347478E+02, 2.7429718902435877E+02, -1.5978845117002061E+02, 1.3121871131803672E+01}; + constexpr FLT c11[] = {-2.4286151057240977E+00, -6.7839829107457454E+00, 4.6999223071396322E+01, -7.4896070961958642E+01, -3.2010113081168477E+01, 2.5022928265034139E+02, -2.8786059319143976E+02, -7.6634590881515742E-06, 2.8786055354435149E+02, -2.5022938574837804E+02, 3.2010133958326769E+01, 7.4896073537458122E+01, -4.6999222973839679E+01, 6.7839829144042234E+00, 2.4286151057002718E+00}; + constexpr FLT c12[] = {-5.4810555663540994E-01, 1.1436870829533889E+00, 8.2471503038810468E-01, -8.5602133190676231E+00, 1.5631626747736027E+01, -6.4979530690388971E+00, -1.8737705444912390E+01, 3.3283700586432069E+01, -1.8737671771580779E+01, -6.4980608237023150E+00, 1.5631576518348636E+01, -8.5602150728872868E+00, 8.2471496023535673E-01, 1.1436870829534245E+00, -5.4810555666110816E-01}; + constexpr FLT c13[] = {-1.4554612894071435E-02, 1.7022157798828938E-01, -3.7563883252838998E-01, 2.0131137597017346E-01, 8.3554102633770899E-01, -2.1191293316246047E+00, 1.9960663397068628E+00, -2.3728355667610635E-05, -1.9960994910423950E+00, 2.1191258420103383E+00, -8.3552532307350946E-01, -2.0131366602953590E-01, 3.7563888705361287E-01, -1.7022157564540871E-01, 1.4554612874103701E-02}; + constexpr FLT c14[] = {-1.2348455954758902E-02, 2.6143546776172359E-03, -2.9252135300577905E-02, 7.5391681327619392E-02, -8.7984403647335341E-02, 1.3344627281489669E-03, 1.5252941418184685E-01, -2.3235937480302737E-01, 1.5257226311939021E-01, 1.3278049251030887E-03, -8.7990378598784807E-02, 7.5392790961460260E-02, -2.9252188648358976E-02, 2.6143533439228375E-03, -1.2348455958015002E-02}; + constexpr FLT c15[] = {1.4214685601398354E-02, -1.2364336624800189E-03, 1.2892619016815934E-03, 1.6178062163508013E-03, -8.2136742192079667E-03, 1.3906385413195475E-02, -1.1450713230272313E-02, -3.7721726447119798E-06, 1.1423376007684534E-02, -1.3922509066323734E-02, 8.2263143670307064E-03, -1.6156663488059737E-03, -1.2892038432598459E-03, 1.2364357359950825E-03, -1.4214685605448193E-02}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {2.6374086784014766E+04, 2.5501413681212697E+07, 1.6835469840840111E+09, 3.1953580806547901E+10, 2.6584910126662793E+11, 1.1715858191494631E+12, 3.0181658330343154E+12, 4.7888775408612793E+12, 4.7888775408612793E+12, 3.0181658330343149E+12, 1.1715858191494631E+12, 2.6584910126662802E+11, 3.1953580806547905E+10, 1.6835469840840123E+09, 2.5501413681212693E+07, 2.6374086784014838E+04}; + constexpr FLT c1[] = {1.2991568388123445E+05, 6.4986154651133671E+07, 2.9142305012947264E+09, 3.9748054433728172E+10, 2.3649443248440253E+11, 7.0471088240421252E+11, 1.0533888905987035E+12, 5.4832304482297614E+11, -5.4832304482297620E+11, -1.0533888905987037E+12, -7.0471088240421265E+11, -2.3649443248440253E+11, -3.9748054433728172E+10, -2.9142305012947268E+09, -6.4986154651133649E+07, -1.2991568388123452E+05}; + constexpr FLT c2[] = {2.8421223836872837E+05, 7.5448503558118597E+07, 2.2710828032883873E+09, 2.1491603403163834E+10, 8.4299374042308197E+10, 1.3384457365769531E+11, 1.8630012765538406E+09, -2.4384536789321063E+11, -2.4384536789321036E+11, 1.8630012765533686E+09, 1.3384457365769537E+11, 8.4299374042308105E+10, 2.1491603403163818E+10, 2.2710828032883859E+09, 7.5448503558118537E+07, 2.8421223836872837E+05}; + constexpr FLT c3[] = {3.6653021243297530E+05, 5.2693428548387125E+07, 1.0410094433021290E+09, 6.3986267576853638E+09, 1.3313926739756351E+10, -2.7909761561126175E+09, -3.9911638977027939E+10, -2.9236947704012280E+10, 2.9236947704013081E+10, 3.9911638977028137E+10, 2.7909761561130028E+09, -1.3313926739756271E+10, -6.3986267576853542E+09, -1.0410094433021282E+09, -5.2693428548387118E+07, -3.6653021243297530E+05}; + constexpr FLT c4[] = {3.1185660915838124E+05, 2.4564274645530283E+07, 3.0509279143241888E+08, 1.0432225146182600E+09, 6.4966284440289930E+07, -4.2483903608015141E+09, -3.1778261722520151E+09, 5.9880587942837610E+09, 5.9880587942838221E+09, -3.1778261722524805E+09, -4.2483903608015366E+09, 6.4966284440239742E+07, 1.0432225146182716E+09, 3.0509279143241870E+08, 2.4564274645530298E+07, 3.1185660915838124E+05}; + constexpr FLT c5[] = {1.8544733523229556E+05, 7.9824949938292857E+06, 5.6880943382648587E+07, 5.4097201999261037E+07, -3.0776449202831459E+08, -3.7659931821870732E+08, 6.8797698944740057E+08, 7.5429896889854825E+08, -7.5429896889813769E+08, -6.8797698944685316E+08, 3.7659931821880990E+08, 3.0776449202837443E+08, -5.4097201999261037E+07, -5.6880943382648058E+07, -7.9824949938292904E+06, -1.8544733523229562E+05}; + constexpr FLT c6[] = {7.9472339236673346E+04, 1.8159676553648554E+06, 5.7259818806757703E+06, -1.2786136236414703E+07, -3.8677490873126298E+07, 4.7651450515746824E+07, 9.0723760109486386E+07, -9.4532949239712372E+07, -9.4532949239553988E+07, 9.0723760109301269E+07, 4.7651450515691362E+07, -3.8677490873146154E+07, -1.2786136236417659E+07, 5.7259818806749191E+06, 1.8159676553648303E+06, 7.9472339236673288E+04}; + constexpr FLT c7[] = {2.4831718998299966E+04, 2.7536301841718081E+05, -5.1045953355375612E+04, -2.6996387880195463E+06, 1.1656554632389303E+06, 9.1521923450131379E+06, -6.8198180924866442E+06, -1.2555197000819867E+07, 1.2555197001241650E+07, 6.8198180927697066E+06, -9.1521923448700085E+06, -1.1656554631878142E+06, 2.6996387880213680E+06, 5.1045953356119258E+04, -2.7536301841717307E+05, -2.4831718998299926E+04}; + constexpr FLT c8[] = {5.6060763597396308E+03, 2.2154740880106889E+04, -1.0243462874801211E+05, -1.1802198892514131E+05, 6.4061699367996352E+05, -1.1166716767206143E+05, -1.4153578101430011E+06, 1.0790712966724981E+06, 1.0790712967259965E+06, -1.4153578105201155E+06, -1.1166716749694763E+05, 6.4061699367337034E+05, -1.1802198891465126E+05, -1.0243462874806672E+05, 2.2154740880108289E+04, 5.6060763597395980E+03}; + constexpr FLT c9[] = {8.7271993222052015E+02, -7.0074676858636565E+02, -1.2528372958260919E+04, 2.3643101058174649E+04, 3.1699060176870429E+04, -1.1270133590467999E+05, 3.6872846694334214E+04, 1.5168911740364679E+05, -1.5168911743408049E+05, -3.6872846682160729E+04, 1.1270133589250650E+05, -3.1699060125133125E+04, -2.3643101053990013E+04, 1.2528372958926657E+04, 7.0074676859379576E+02, -8.7271993222046206E+02}; + constexpr FLT c10[] = {7.8842259458809167E+01, -4.2070880912368045E+02, -1.0535142084668550E+02, 3.3375056840527291E+03, -4.9426353391946941E+03, -3.6567309106352213E+03, 1.5199085303756190E+04, -9.4972223386509122E+03, -9.4972222612539845E+03, 1.5199085250589107E+04, -3.6567308608802218E+03, -4.9426353295200679E+03, 3.3375056868169195E+03, -1.0535142136497778E+02, -4.2070880912233122E+02, 7.8842259458809863E+01}; + constexpr FLT c11[] = {8.9833076822322541E-02, -4.4163371176090656E+01, 1.2880771155499514E+02, 2.8722193371824223E+00, -5.7164633743445722E+02, 9.0417612969072786E+02, 1.1220387898916500E+00, -1.4190926236781661E+03, 1.4190921497862169E+03, -1.1219395160922474E+00, -9.0417626783116691E+02, 5.7164631339646269E+02, -2.8722233955477368E+00, -1.2880771178913139E+02, 4.4163371168774162E+01, -8.9833076836661779E-02}; + constexpr FLT c12[] = {-1.0900468357478950E+00, -1.1264666525354303E-01, 1.1810668147959248E+01, -3.0289105313513339E+01, 1.5494580774353590E+01, 6.0129886123389447E+01, -1.2330199171381130E+02, 6.7114507519752891E+01, 6.7114417724195803E+01, -1.2330220722314033E+02, 6.0129944490502041E+01, 1.5494578529464169E+01, -3.0289104892597450E+01, 1.1810668147959559E+01, -1.1264666963803399E-01, -1.0900468357479236E+00}; + constexpr FLT c13[] = {-1.1763610120003680E-01, 4.2939195911805172E-01, -2.7950209959937194E-01, -1.7354549670508441E+00, 5.1182015415147619E+00, -5.0538827161604676E+00, -2.1270036462171213E+00, 1.0709458682620088E+01, -1.0709612225647817E+01, 2.1267942693611270E+00, 5.0538338615607357E+00, -5.1181806038291624E+00, 1.7354571480597607E+00, 2.7950229043765212E-01, -4.2939195443229039E-01, 1.1763610122666045E-01}; + constexpr FLT c14[] = {-1.8020499668410097E-02, 3.6694580839244442E-02, -1.1331134794057113E-01, 1.3971228975695787E-01, 8.1734604430561311E-02, -5.4464516301492671E-01, 7.9646109231150031E-01, -3.9024149191964747E-01, -3.9020325223035940E-01, 7.9644613359376126E-01, -5.4458780348100966E-01, 8.1735287282159258E-02, 1.3971280189565236E-01, -1.1331156133169454E-01, 3.6694584840328316E-02, -1.8020499652780946E-02}; + constexpr FLT c15[] = {1.4589783473923206E-02, -7.8885429103313365E-04, -4.4856766056362643E-03, 1.8116483572926646E-02, -3.0574294775135746E-02, 1.8967420978453962E-02, 2.4666137072064612E-02, -6.8017929307730221E-02, 6.7615302446897660E-02, -2.4691085605299815E-02, -1.9038882601578176E-02, 3.0552398456072709E-02, -1.8118938614760938E-02, 4.4854443719491892E-03, 7.8884755210919307E-04, -1.4589783498222219E-02}; + constexpr FLT c16[] = {-1.0467998078291846E-02, -3.2140608463710125E-04, 5.2959666930518063E-04, -1.5769844275261027E-04, -1.4331371817542763E-03, 3.7100687637655694E-03, -3.8742310984482158E-03, 1.6810223071268796E-03, 1.6547563335702548E-03, -3.9924279794162345E-03, 3.6969357769948610E-03, -1.4380620517984166E-03, -1.5934006609813836E-04, 5.2953895598459668E-04, -3.2140848935911386E-04, -1.0467998075160606E-02}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else + printf("width not implemented!\n"); From db0457ab75d39e6d65c717c82cbc971440c83bad Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:00:09 -0400 Subject: [PATCH 16/39] restoring .m from master --- devel/gen_all_horner_C_code.m | 26 +++++++------------------- devel/gen_ker_horner_loop_C_code.m | 4 ++-- 2 files changed, 9 insertions(+), 21 deletions(-) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index 754d91d61..009e05ea4 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -10,26 +10,14 @@ clear opts = struct(); -ws = 2:16; -upsampfac = 1.25; % sigma (upsampling): either 2 (default) or low (eg 5/4). -opts.wpad = false; % pad kernel eval to multiple of 4 +for upsampfac = [2.0, 1.25]; % sigma: either 2 (default) or low (eg 5/4) + fprintf('upsampfac = %g...\n',upsampfac) + + ws = 2:16; + opts.wpad = true; % pad kernel eval to multiple of 4 -if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop.inc','w'); -else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc','w'); -end -fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); -fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); -for j=1:numel(ws) - w = ws(j) - if upsampfac==2 % hardwire the betas for this default case - betaoverws = [2.20 2.26 2.38 2.30]; % matches setup_spreader - beta = betaoverws(min(4,w-1)) * w; % uses last entry for w>=5 - d = w + 2 + (w<=8); % between 2-3 more degree than w - else % use formulae, must match params in setup_spreader... - gamma=0.97; % safety factor - betaoverws = gamma*pi*(1-1/(2*upsampfac)); % from cutoff freq formula - beta = betaoverws * w; - d = w + 1 + (w<=8); % less, since beta smaller, smoother + if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w'); + else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m index d1a635536..e2dd1b75a 100644 --- a/devel/gen_ker_horner_loop_C_code.m +++ b/devel/gen_ker_horner_loop_C_code.m @@ -37,8 +37,8 @@ else width = w; end -for n=1:d % loop over poly coeff powers - s = sprintf('constexpr FLT c%d[] = {%.16E',n-1, C(n,1)); +for n=1:d+1 % loop over poly coeff powers + s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1)); for i=2:width % loop over segments s = sprintf('%s, %.16E', s, C(n,i)); end From d0ce11e718f257a4bf1bef94ff014f7a64b8e323 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:11:05 -0400 Subject: [PATCH 17/39] updated hook --- .pre-commit-config.yaml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 3b839e6f5..ac25e3f63 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,5 +1,12 @@ repos: - repo: https://github.com/pre-commit/mirrors-clang-format - rev: 'v18.1.6' + rev: 'v18.1.8' hooks: - id: clang-format + types_or: [c++, c, cuda] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v2.3.0 + hooks: + - id: check-yaml + - id: end-of-file-fixer + - id: trailing-whitespace From 798717d9a6f97f4c5f0c6904de89b10d06e3fa61 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:44:50 -0400 Subject: [PATCH 18/39] updated coefficients --- devel/gen_all_horner_C_code.m | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m index baf6590cf..51aa4e4e1 100644 --- a/devel/gen_all_horner_C_code.m +++ b/devel/gen_all_horner_C_code.m @@ -16,8 +16,8 @@ ws = 2:16; opts.wpad = false; % pad kernel eval to multiple of 4 - if upsampfac==2, fid = fopen('../inclue/cuda/contrib/ker_horner_allw_loop_constexpr.inc','w'); - else, fid = fopen('../inclue/cuda/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); + if upsampfac==2, fid = fopen('../include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc','w'); + else, fid = fopen('../include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc','w'); end fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n')); fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n')); From 282baf50ea3a4d000e1cc3ed40d940424d54639a Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 12:46:37 -0400 Subject: [PATCH 19/39] new coeffs --- .../ker_horner_allw_loop_constexpr.inc | 205 +++++++++++++++++ ...owupsampfac_horner_allw_loop_constexpr.inc | 171 +++++++++++++++ src/ker_horner_allw_loop.inc | 207 ++++++++++++++++++ 3 files changed, 583 insertions(+) create mode 100644 include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc create mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc create mode 100644 src/ker_horner_allw_loop.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc new file mode 100644 index 000000000..1f4c59e2a --- /dev/null +++ b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc @@ -0,0 +1,205 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. + if (w==2) { + constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; + constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; + constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; + constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; + constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; + constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==4) { + constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; + constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; + constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; + constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; + constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; + constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; + constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==5) { + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; + constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; + constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; + constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; + constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; + constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; + constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==6) { + constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; + constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; + constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; + constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; + constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; + constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; + constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; + constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==7) { + constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; + constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; + constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; + constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; + constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; + constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; + constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; + constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; + constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; + constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==8) { + constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; + constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; + constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; + constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; + constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; + constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; + constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; + constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; + constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; + constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; + constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; + constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; + constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; + constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; + constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; + constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; + constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; + constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; + constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; + constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; + constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; + constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; + constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; + constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; + constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; + constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; + constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; + constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==11) { + constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; + constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; + constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; + constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; + constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; + constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; + constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; + constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; + constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; + constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; + constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; + constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; + constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; + constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; + constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; + constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; + constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; + constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; + constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; + constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; + constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; + constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; + constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; + constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; + constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; + constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; + constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; + constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; + constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; + constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; + constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; + constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; + constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; + constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; + constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; + constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; + constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; + constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; + constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; + constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; + constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; + constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; + constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; + constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; + constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; + constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; + constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; + constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; + constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; + constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; + constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; + constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; + constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; + constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; + constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; + constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; + constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; + constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; + constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; + constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; + constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; + constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; + constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; + constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; + constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; + constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; + constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; + constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; + constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; + constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; + constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; + constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; + constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; + constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; + constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; + constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; + constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; + constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; + constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; + constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; + constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; + constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc new file mode 100644 index 000000000..e2fa229b7 --- /dev/null +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc @@ -0,0 +1,171 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. + if (w==2) { + constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; + constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; + constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; + constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if (w==3) { + constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; + constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; + constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; + constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; + constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==4) { + constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; + constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; + constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; + constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; + constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; + constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==5) { + constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; + constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; + constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; + constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; + constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; + constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==6) { + constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; + constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; + constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; + constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; + constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; + constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==7) { + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; + constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; + constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; + constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; + constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; + constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; + constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; + constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==8) { + constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; + constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; + constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; + constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; + constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; + constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; + constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; + constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; + constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; + constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; + constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; + constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; + constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==10) { + constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; + constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; + constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; + constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; + constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; + constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; + constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; + constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; + constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; + constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==11) { + constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; + constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; + constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; + constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; + constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; + constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; + constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; + constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; + constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; + constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==12) { + constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; + constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; + constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; + constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; + constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; + constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; + constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; + constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; + constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; + constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==13) { + constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; + constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; + constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; + constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; + constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; + constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; + constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; + constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; + constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; + constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; + constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; + constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; + constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; + constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; + constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; + constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; + constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; + constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; + constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; + constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; + constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; + constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; + constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; + constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; + constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; + constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; + constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; + constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; + constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; + constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; + constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; + constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; + constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; + constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; + constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; + constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; + constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; + constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; + constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; + constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; + constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; + constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; + constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; + constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; + constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; + constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/src/ker_horner_allw_loop.inc b/src/ker_horner_allw_loop.inc new file mode 100644 index 000000000..953c4618b --- /dev/null +++ b/src/ker_horner_allw_loop.inc @@ -0,0 +1,207 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. + if (w==2) { + constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; + constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; + constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; + constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; + constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if (w==3) { + constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; + constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; + constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; + constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; + constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; + constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; + for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if (w==4) { + constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; + constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; + constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; + constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; + constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; + constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; + constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if (w==5) { + constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; + constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; + constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; + constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; + constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; + constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; + constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; + constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if (w==6) { + constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; + constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; + constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; + constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; + constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; + constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; + constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; + constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; + constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if (w==7) { + constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; + constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; + constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; + constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; + constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; + constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; + constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; + constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; + constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; + constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if (w==8) { + constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; + constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; + constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; + constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; + constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; + constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; + constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; + constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; + constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; + constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; + constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==9) { + constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; + constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; + constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; + constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; + constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; + constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; + constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; + constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; + constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; + constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; + constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if (w==10) { + constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; + constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; + constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; + constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; + constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; + constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; + constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; + constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; + constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; + constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; + constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; + constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if (w==11) { + constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; + constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; + constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; + constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; + constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; + constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; + constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; + constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; + constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; + constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; + constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; + constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; + constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if (w==12) { + constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; + constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; + constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; + constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; + constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; + constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; + constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; + constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; + constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; + constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; + constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; + constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; + constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; + constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if (w==13) { + constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; + constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; + constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; + constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; + constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; + constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; + constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; + constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; + constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; + constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; + constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; + constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; + constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; + constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; + constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if (w==14) { + constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; + constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; + constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; + constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; + constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; + constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; + constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; + constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; + constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; + constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; + constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; + constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; + constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; + constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; + constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; + constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if (w==15) { + constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; + constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; + constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; + constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; + constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; + constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; + constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; + constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; + constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; + constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; + constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; + constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; + constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; + constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; + constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; + constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; + constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else if (w==16) { + constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; + constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; + constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; + constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; + constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; + constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; + constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; + constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; + constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; + constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; + constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; + constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; + constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; + constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; + constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; + constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; + constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; + constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); + } else + printf("width not implemented!\n"); From 12822a218ffbf6c8090a2343440b6d1c5bae81d0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 23 Jul 2024 14:40:42 -0400 Subject: [PATCH 20/39] updated cufinufft to new coeff --- .../contrib/ker_horner_allw_loop.inc | 346 +++++++++--------- .../ker_horner_allw_loop_constexpr.inc | 205 ----------- .../ker_lowupsampfac_horner_allw_loop.inc | 317 ++++++++-------- ...owupsampfac_horner_allw_loop_constexpr.inc | 171 --------- include/cufinufft/spreadinterp.h | 22 +- 5 files changed, 329 insertions(+), 732 deletions(-) delete mode 100644 include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc delete mode 100644 include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc diff --git a/include/cufinufft/contrib/ker_horner_allw_loop.inc b/include/cufinufft/contrib/ker_horner_allw_loop.inc index 953c4618b..1f4c59e2a 100644 --- a/include/cufinufft/contrib/ker_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_horner_allw_loop.inc @@ -2,206 +2,204 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. if (w==2) { - constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; - constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; - constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; - constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; - constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; + constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; + constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; + constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; + for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); } else if (w==3) { - constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; - constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; - constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; - constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; - constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; - constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; + constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; + constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; + constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; + constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; + constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; + constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==4) { - constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; - constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; - constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; - constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; - constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; - constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; - constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; + constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; + constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; + constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; + constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; + constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; + constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; + constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==5) { - constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; - constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; - constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; - constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; - constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; - constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; - constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; - constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; + constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; + constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; + constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; + constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; + constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; + constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; + constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; + constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==6) { - constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; - constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; - constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; - constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; - constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; - constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; - constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; - constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; - constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; + constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; + constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; + constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; + constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; + constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; + constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; + constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; + constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; + constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==7) { - constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; - constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; - constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; - constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; - constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; - constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; - constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; - constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; - constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; - constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; + constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; + constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; + constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; + constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; + constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; + constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; + constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; + constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; + constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; + constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==8) { - constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; - constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; - constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; - constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; - constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; - constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; - constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; - constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; - constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; - constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; - constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; + constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; + constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; + constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; + constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; + constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; + constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; + constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; + constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; + constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==9) { - constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; - constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; - constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; - constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; - constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; - constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; - constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; - constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; - constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; - constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; - constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; + constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; + constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; + constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; + constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; + constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; + constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; + constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; + constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; + constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; + constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; + constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==10) { - constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; - constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; - constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; - constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; - constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; - constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; - constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; - constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; - constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; - constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; - constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; - constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; + constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; + constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; + constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; + constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; + constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; + constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; + constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; + constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; + constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; + constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; + constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; + constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==11) { - constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; - constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; - constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; - constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; - constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; - constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; - constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; - constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; - constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; - constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; - constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; - constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; - constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; + constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; + constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; + constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; + constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; + constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; + constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; + constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; + constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; + constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; + constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; + constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; + constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; + constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==12) { - constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; - constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; - constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; - constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; - constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; - constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; - constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; - constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; - constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; - constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; - constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; - constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; - constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; - constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; + constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; + constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; + constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; + constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; + constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; + constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; + constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; + constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; + constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; + constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; + constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; + constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; + constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; + constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else if (w==13) { - constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; - constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; - constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; - constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; - constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; - constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; - constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; - constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; - constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; - constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; - constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; - constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; - constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; - constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; - constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; + constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; + constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; + constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; + constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; + constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; + constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; + constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; + constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; + constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; + constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; + constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; + constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; + constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; + constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; + constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); } else if (w==14) { - constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; - constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; - constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; - constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; - constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; - constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; - constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; - constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; - constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; - constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; - constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; - constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; - constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; - constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; - constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; + constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; + constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; + constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; + constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; + constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; + constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; + constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; + constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; + constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; + constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; + constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; + constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; + constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; + constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; + constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; + constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); } else if (w==15) { - constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; - constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; - constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; - constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; - constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; - constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; - constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; - constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; - constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; - constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; - constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; - constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; - constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; - constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; - constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; - constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; - constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; + constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; + constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; + constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; + constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; + constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; + constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; + constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; + constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; + constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; + constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; + constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; + constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; + constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; + constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; + constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; + constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; + constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); } else if (w==16) { - constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; - constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; - constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; - constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; - constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; - constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; - constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; - constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; - constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; - constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; - constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; - constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; - constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; - constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; - constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; - constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; - constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; - constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; + constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; + constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; + constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; + constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; + constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; + constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; + constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; + constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; + constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; + constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; + constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; + constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; + constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; + constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; + constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; + constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; + constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; + constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc deleted file mode 100644 index 1f4c59e2a..000000000 --- a/include/cufinufft/contrib/ker_horner_allw_loop_constexpr.inc +++ /dev/null @@ -1,205 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {5.5428559551548406E-01, 5.5428559551548395E-01}; - constexpr FLT c1[] = {7.0481840008800778E-01, -7.0481840008800811E-01}; - constexpr FLT c2[] = {-2.2584311526143548E-02, -2.2584311526143607E-02}; - constexpr FLT c3[] = {-2.5024197515954211E-01, 2.5024197515954211E-01}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {1.7787237246937579E-01, 1.0000000000000013E+00, 1.7787237247678464E-01}; - constexpr FLT c1[] = {3.5966530797581003E-01, -4.2425842671825248E-17, -3.5966530796781060E-01}; - constexpr FLT c2[] = {2.0160576446392536E-01, -3.7666666666667331E-01, 2.0160576447145470E-01}; - constexpr FLT c3[] = {-1.7450587318669351E-02, 2.2939218956436377E-17, 1.7450587325767743E-02}; - constexpr FLT c4[] = {-4.2902993854032963E-02, 6.0475925925925586E-02, -4.2902993846219546E-02}; - constexpr FLT c5[] = {-4.5057857403453909E-03, 6.6232851036457955E-18, 4.5057857475245110E-03}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - constexpr FLT c0[] = {3.9828257752799377E-02, 7.3911656575585805E-01, 7.3911656575585805E-01, 3.9828257752799433E-02}; - constexpr FLT c1[] = {1.0749328817387334E-01, 4.5419700247912287E-01, -4.5419700247912287E-01, -1.0749328817387330E-01}; - constexpr FLT c2[] = {1.0408888748149289E-01, -1.0268333881994456E-01, -1.0268333881994476E-01, 1.0408888748149285E-01}; - constexpr FLT c3[] = {3.7516840869185789E-02, -1.0412335657155622E-01, 1.0412335657155641E-01, -3.7516840869185733E-02}; - constexpr FLT c4[] = {-3.5432868834529888E-03, 2.8903049344237370E-03, 2.8903049344238003E-03, -3.5432868834529676E-03}; - constexpr FLT c5[] = {-5.7512181801490673E-03, 1.0945950376831730E-02, -1.0945950376831654E-02, 5.7512181801490829E-03}; - constexpr FLT c6[] = {-7.3657365672905430E-04, 3.7144674885200340E-04, 3.7144674885207063E-04, -7.3657365672907728E-04}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - constexpr FLT c0[] = {1.0051451410391413E-02, 3.8286382489474308E-01, 1.0000000000000009E+00, 3.8286382489474252E-01, 1.0051451410391420E-02}; - constexpr FLT c1[] = {3.0826052021380446E-02, 3.8431958613457984E-01, -4.7102147373384796E-32, -3.8431958613457951E-01, -3.0826052021380446E-02}; - constexpr FLT c2[] = {3.6562231959204314E-02, 7.8509612097392906E-02, -2.3000000000000059E-01, 7.8509612097392906E-02, 3.6562231959204300E-02}; - constexpr FLT c3[] = {2.0250135419918262E-02, -3.9381037339048602E-02, 1.0193845429304082E-16, 3.9381037339048686E-02, -2.0250135419918248E-02}; - constexpr FLT c4[] = {4.0593041193018580E-03, -1.6067481167759540E-02, 2.4150000000000074E-02, -1.6067481167759530E-02, 4.0593041193018597E-03}; - constexpr FLT c5[] = {-9.2488937959280210E-04, 1.2476700479675494E-03, 1.0406437805617128E-16, -1.2476700479676270E-03, 9.2488937959280405E-04}; - constexpr FLT c6[] = {-5.6059657038176136E-04, 1.2116190166774866E-03, -1.5448333333332675E-03, 1.2116190166775878E-03, -5.6059657038176342E-04}; - constexpr FLT c7[] = {-3.4201716508558499E-05, 2.3137115416428607E-05, 3.6450914717742488E-17, -2.3137115416288715E-05, 3.4201716508574924E-05}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - constexpr FLT c0[] = {2.0875119883113440E-03, 1.5741818314646622E-01, 8.2446837122968764E-01, 8.2446837122968819E-01, 1.5741818314646633E-01, 2.0875119883208737E-03}; - constexpr FLT c1[] = {7.2383827471879086E-03, 2.0903648995439439E-01, 3.2052935784357633E-01, -3.2052935784357606E-01, -2.0903648995439447E-01, -7.2383827471776260E-03}; - constexpr FLT c2[] = {1.0180085126333453E-02, 9.2337811484269047E-02, -1.0253741712233820E-01, -1.0253741712233828E-01, 9.2337811484268964E-02, 1.0180085126343144E-02}; - constexpr FLT c3[] = {7.3669955501269460E-03, 4.9102900025223507E-03, -5.1302324979469405E-02, 5.1302324979469550E-02, -4.9102900025223160E-03, -7.3669955501178214E-03}; - constexpr FLT c4[] = {2.7444270008043898E-03, -8.0004810696544734E-03, 5.2920367975573743E-03, 5.2920367975574090E-03, -8.0004810696544873E-03, 2.7444270008144425E-03}; - constexpr FLT c5[] = {3.2622379114949894E-04, -1.8514138516535197E-03, 3.8520985619445234E-03, -3.8520985619444454E-03, 1.8514138516535119E-03, -3.2622379114026425E-04}; - constexpr FLT c6[] = {-1.2239646122606432E-04, 2.2750660293442782E-04, -1.2702072030317145E-04, -1.2702072030306984E-04, 2.2750660293439860E-04, -1.2239646121695236E-04}; - constexpr FLT c7[] = {-4.6695893922776242E-05, 1.1717219021520763E-04, -1.8098268625859964E-04, 1.8098268625869589E-04, -1.1717219021517810E-04, 4.6695893931711504E-05}; - constexpr FLT c8[] = {-1.5875418082745247E-06, 7.2147850127730698E-07, -7.0930078293142108E-08, -7.0930078245872243E-08, 7.2147850127811706E-07, -1.5875417996312271E-06}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - constexpr FLT c0[] = {4.0677823488318067E-04, 5.5714997521829540E-02, 5.1113018541287825E-01, 1.0000000000000002E+00, 5.1113018541287869E-01, 5.5714997521829561E-02, 4.0677823488475981E-04}; - constexpr FLT c1[] = {1.5569364307494555E-03, 8.9228372765634056E-02, 3.5049603091348180E-01, -1.8840858949353919E-32, -3.5049603091348197E-01, -8.9228372765634029E-02, -1.5569364307477620E-03}; - constexpr FLT c2[] = {2.4904843753404838E-03, 5.4888936725282375E-02, 2.4759577399513382E-02, -1.6428571428571445E-01, 2.4759577399513264E-02, 5.4888936725282340E-02, 2.4904843753420954E-03}; - constexpr FLT c3[] = {2.1552691780265232E-03, 1.3627105791872422E-02, -3.3718114813591167E-02, 1.0435679823191637E-16, 3.3718114813591278E-02, -1.3627105791872396E-02, -2.1552691780250210E-03}; - constexpr FLT c4[] = {1.0735311014902868E-03, -7.2030895675484117E-04, -6.6760503000563741E-03, 1.2656705539358732E-02, -6.6760503000563680E-03, -7.2030895675483119E-04, 1.0735311014919520E-03}; - constexpr FLT c5[] = {2.8413019973530626E-04, -1.1175797418592351E-03, 1.3906361031252640E-03, 1.0099777883094147E-16, -1.3906361031252017E-03, 1.1175797418592505E-03, -2.8413019973377792E-04}; - constexpr FLT c6[] = {1.6363160465889005E-05, -1.5802085209242310E-04, 4.4431051893374396E-04, -6.0985626028865780E-04, 4.4431051893376408E-04, -1.5802085209243416E-04, 1.6363160467394339E-05}; - constexpr FLT c7[] = {-1.2513684117291295E-05, 2.9105578584781478E-05, -2.8835295309364819E-05, 6.9093005849597210E-17, 2.8835295309456306E-05, -2.9105578584752466E-05, 1.2513684118770622E-05}; - constexpr FLT c8[] = {-3.2859430043343403E-06, 9.3570096164232078E-06, -1.7015821249906871E-05, 2.0688046128660197E-05, -1.7015821249876886E-05, 9.3570096164290557E-06, -3.2859430029058764E-06}; - constexpr FLT c9[] = {-1.5030958477935016E-08, -9.3540219413709317E-08, 1.3079704875560537E-07, 3.0755088144886539E-17, -1.3079704870024676E-07, 9.3540219430316894E-08, 1.5030959705830809E-08}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==8) { - constexpr FLT c0[] = {7.5442178667264049E-05, 1.7659090182402852E-02, 2.6112828482312650E-01, 8.6561421087578294E-01, 8.6561421087578294E-01, 2.6112828482312650E-01, 1.7659090182402856E-02, 7.5442178667263913E-05}; - constexpr FLT c1[] = {3.1361556564941527E-04, 3.2518751351035657E-02, 2.4295266212395961E-01, 2.5083142126627195E-01, -2.5083142126627200E-01, -2.4295266212395961E-01, -3.2518751351035664E-02, -3.1361556564941506E-04}; - constexpr FLT c2[] = {5.5627094085228170E-04, 2.4604803324737457E-02, 6.5902977410162822E-02, -9.1064379250067565E-02, -9.1064379250067648E-02, 6.5902977410162836E-02, 2.4604803324737447E-02, 5.5627094085228149E-04}; - constexpr FLT c3[] = {5.5053208919074741E-04, 9.2359485489686977E-03, -6.2169545154249764E-03, -3.1386277864020387E-02, 3.1386277864020692E-02, 6.2169545154250301E-03, -9.2359485489686925E-03, -5.5053208919074741E-04}; - constexpr FLT c4[] = {3.3122072653963820E-04, 1.3353118718124376E-03, -5.9878504390516807E-03, 4.3217905833729843E-03, 4.3217905833729184E-03, -5.9878504390516564E-03, 1.3353118718124411E-03, 3.3122072653963842E-04}; - constexpr FLT c5[] = {1.2112223749399388E-04, -2.3174709024353528E-04, -5.1773322458159945E-04, 1.8691284471382664E-03, -1.8691284471382276E-03, 5.1773322458165388E-04, 2.3174709024353332E-04, -1.2112223749399391E-04}; - constexpr FLT c6[] = {2.3288943339077962E-05, -1.1810885265513022E-04, 2.1380000655379686E-04, -1.1905274322668279E-04, -1.1905274322667877E-04, 2.1380000655378596E-04, -1.1810885265513386E-04, 2.3288943339077766E-05}; - constexpr FLT c7[] = {8.7290223704935849E-08, -9.9551635569432461E-06, 3.9042123573714734E-05, -7.0647330846704962E-05, 7.0647330846826175E-05, -3.9042123573667747E-05, 9.9551635569490195E-06, -8.7290223704824623E-08}; - constexpr FLT c8[] = {-1.0444417486661213E-06, 2.8837147790326586E-06, -3.9445588398358951E-06, 1.9505656879624058E-06, 1.9505656880227840E-06, -3.9445588398203690E-06, 2.8837147790369691E-06, -1.0444417486660073E-06}; - constexpr FLT c9[] = {-1.9601350641688945E-07, 6.2981383505868899E-07, -1.3252363384761618E-06, 1.9071649677058813E-06, -1.9071649677363285E-06, 1.3252363385149127E-06, -6.2981383505419114E-07, 1.9601350641697053E-07}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3445576990655693E-05, 5.1377966678943553E-03, 1.1569392196071671E-01, 5.9595989228910695E-01, 1.0000000000000004E+00, 5.9595989228910784E-01, 1.1569392196071673E-01, 5.1377966678943874E-03, 1.3445576990655681E-05}; - constexpr FLT c1[] = {6.0003223623206657E-05, 1.0569385595664990E-02, 1.3202059711663530E-01, 3.1241329121161582E-01, -8.4851685343650422E-17, -3.1241329121161615E-01, -1.3202059711663522E-01, -1.0569385595665032E-02, -6.0003223623206596E-05}; - constexpr FLT c2[] = {1.1601811379064824E-04, 9.2861699099147151E-03, 5.4760895870332324E-02, -2.7420112488894219E-04, -1.2777777777777805E-01, -2.7420112488935430E-04, 5.4760895870332296E-02, 9.2861699099147359E-03, 1.1601811379064817E-04}; - constexpr FLT c3[] = {1.2783089927061688E-04, 4.4048543606096807E-03, 6.4505427512762566E-03, -2.6627297241817574E-02, 1.0570032264240285E-16, 2.6627297241817935E-02, -6.4505427512762245E-03, -4.4048543606096877E-03, -1.2783089927061688E-04}; - constexpr FLT c4[] = {8.8459828362140127E-05, 1.1147546008569559E-03, -2.1200589329645782E-03, -2.9677441441083273E-03, 7.7692043895744413E-03, -2.9677441441080211E-03, -2.1200589329645678E-03, 1.1147546008569583E-03, 8.8459828362140168E-05}; - constexpr FLT c5[] = {3.9567294647305465E-05, 8.1817980646548672E-05, -7.2116754318327786E-04, 1.0390038161997466E-03, 1.3960675422467541E-16, -1.0390038161998867E-03, 7.2116754318328556E-04, -8.1817980646550122E-05, -3.9567294647305431E-05}; - constexpr FLT c6[] = {1.1032857092605887E-05, -3.4254477931955853E-05, -1.3557143976035256E-05, 1.8667778536557664E-04, -2.9974999576614188E-04, 1.8667778536546106E-04, -1.3557143976042615E-05, -3.4254477931959885E-05, 1.1032857092605841E-05}; - constexpr FLT c7[] = {1.5345430093717796E-06, -9.9308189188274098E-06, 2.3762810604639151E-05, -2.4017602201954516E-05, 1.1627785359675844E-17, 2.4017602202115669E-05, -2.3762810604628780E-05, 9.9308189188319669E-06, -1.5345430093718216E-06}; - constexpr FLT c8[] = {-8.1737159283255726E-08, -4.1540916378247392E-07, 2.6668107554223020E-06, -6.3261434127908313E-06, 8.2578681449311880E-06, -6.3261434126076934E-06, 2.6668107554440373E-06, -4.1540916378676467E-07, -8.1737159283249333E-08}; - constexpr FLT c9[] = {-7.3256982980608342E-08, 2.3321978963880019E-07, -4.0030411105333760E-07, 3.4388260968054864E-07, 6.5677795522570459E-17, -3.4388260990751890E-07, 4.0030411105333760E-07, -2.3321978963499429E-07, 7.3256982980640781E-08}; - constexpr FLT c10[] = {-1.0121400696579195E-08, 3.6191328862414928E-08, -8.7258577118961372E-08, 1.4622014477867198E-07, -1.7333902174790525E-07, 1.4622014483401952E-07, -8.7258577100106683E-08, 3.6191328859901120E-08, -1.0121400696606260E-08}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - constexpr FLT c0[] = {2.3186292807626266E-06, 1.3952040327729876E-03, 4.5894237568906843E-02, 3.4666431215091636E-01, 8.9110862394332080E-01, 8.9110862394332024E-01, 3.4666431215091614E-01, 4.5894237568906843E-02, 1.3952040327729804E-03, 2.3186292807626329E-06}; - constexpr FLT c1[] = {1.1010978063160391E-05, 3.1454190365986022E-03, 6.0943215953720313E-02, 2.5074802988370321E-01, 2.0598750885032702E-01, -2.0598750885032710E-01, -2.5074802988370315E-01, -6.0943215953720306E-02, -3.1454190365985909E-03, -1.1010978063160380E-05}; - constexpr FLT c2[] = {2.2925449299630732E-05, 3.1050615653861980E-03, 3.2337657329423494E-02, 4.4760550762170469E-02, -8.0226193254406428E-02, -8.0226193254406289E-02, 4.4760550762170441E-02, 3.2337657329423480E-02, 3.1050615653861868E-03, 2.2925449299630681E-05}; - constexpr FLT c3[] = {2.7622345748507540E-05, 1.7317590416004974E-03, 7.6620063086756569E-03, -9.8393115612840278E-03, -2.1163068654269049E-02, 2.1163068654269510E-02, 9.8393115612841128E-03, -7.6620063086756491E-03, -1.7317590416004913E-03, -2.7622345748507479E-05}; - constexpr FLT c4[] = {2.1363614860997117E-05, 5.7553475552091617E-04, 1.4813144535930287E-04, -4.1113061120761924E-03, 3.3662735809591683E-03, 3.3662735809590794E-03, -4.1113061120762826E-03, 1.4813144535930759E-04, 5.7553475552091368E-04, 2.1363614860997080E-05}; - constexpr FLT c5[] = {1.1063475580065299E-05, 1.0180053030149723E-04, -3.4137441280837177E-04, -4.9828659222651745E-05, 1.0442648308817235E-03, -1.0442648308817467E-03, 4.9828659222713965E-05, 3.4137441280837177E-04, -1.0180053030149541E-04, -1.1063475580065281E-05}; - constexpr FLT c6[] = {3.8359011440648869E-06, 1.3049698816919587E-06, -6.3791463619208982E-05, 1.4528730872072194E-04, -8.6630472952355992E-05, -8.6630472952398913E-05, 1.4528730872073633E-04, -6.3791463619214471E-05, 1.3049698816901833E-06, 3.8359011440648767E-06}; - constexpr FLT c7[] = {8.3366418668164326E-07, -3.5785601754616355E-06, 2.4539930904858821E-06, 1.2754336575782058E-05, -3.3000414536039571E-05, 3.3000414536273711E-05, -1.2754336575693992E-05, -2.4539930904800897E-06, 3.5785601754627781E-06, -8.3366418668163871E-07}; - constexpr FLT c8[] = {8.0572098823818712E-08, -6.8352224328357488E-07, 2.0695541423376112E-06, -2.9709579576770532E-06, 1.5005770225996294E-06, 1.5005770226481292E-06, -2.9709579578116679E-06, 2.0695541423438809E-06, -6.8352224328404986E-07, 8.0572098823810798E-08}; - constexpr FLT c9[] = {-1.0412910456843575E-08, -3.6228831474008107E-09, 1.3932530225640674E-07, -4.5071262434444286E-07, 7.5149884418348562E-07, -7.5149884428313110E-07, 4.5071262441364111E-07, -1.3932530225017888E-07, 3.6228831478332996E-09, 1.0412910456861821E-08}; - constexpr FLT c10[] = {-4.4291858216944146E-09, 1.5904364893350153E-08, -3.2603275106346107E-08, 3.8190045632066571E-08, -1.7631718176528265E-08, -1.7631718292171639E-08, 3.8190045621381707E-08, -3.2603275098803994E-08, 1.5904364893978648E-08, -4.4291858217073890E-09}; - constexpr FLT c11[] = {-4.4040059170580565E-10, 1.7857872825180656E-09, -4.9203237617335969E-09, 9.5125262125165431E-09, -1.3157194779492521E-08, 1.3157194812996001E-08, -9.5125262191888681E-09, 4.9203237596041585E-09, -1.7857872834763311E-09, 4.4040059170802652E-10}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==11) { - constexpr FLT c0[] = {3.8884809238313434E-07, 3.5785567372179951E-04, 1.6654951019551330E-02, 1.7692785324424570E-01, 6.5593328211813162E-01, 9.9999999999999978E-01, 6.5593328211813129E-01, 1.7692785324424565E-01, 1.6654951019551330E-02, 3.5785567372179962E-04, 3.8884809238312539E-07}; - constexpr FLT c1[] = {1.9516358260453364E-06, 8.7214421096705593E-04, 2.4929466432368100E-02, 1.5885079249667189E-01, 2.7894884556454935E-01, 9.4204294746769595E-33, -2.7894884556454941E-01, -1.5885079249667189E-01, -2.4929466432368097E-02, -8.7214421096705604E-04, -1.9516358260453169E-06}; - constexpr FLT c2[] = {4.3353827605930511E-06, 9.4705645354715550E-04, 1.5700144896729017E-02, 4.8428271550326758E-02, -1.2807080799297165E-02, -1.0454545454545448E-01, -1.2807080799297061E-02, 4.8428271550326821E-02, 1.5700144896729006E-02, 9.4705645354715518E-04, 4.3353827605930215E-06}; - constexpr FLT c3[] = {5.6395387871289846E-06, 5.9760549110825473E-04, 5.0911332059142295E-03, 1.6690038662948304E-03, -2.1030028251697912E-02, 1.4335617874817167E-16, 2.1030028251698141E-02, -1.6690038662947660E-03, -5.0911332059142200E-03, -5.9760549110825429E-04, -5.6395387871289508E-06}; - constexpr FLT c4[] = {4.7836299264887200E-06, 2.3732554180006408E-04, 7.1846854433598795E-04, -2.2660086673713248E-03, -1.3190061226035158E-03, 5.2488730277989188E-03, -1.3190061226033569E-03, -2.2660086673713374E-03, 7.1846854433598557E-04, 2.3732554180006421E-04, 4.7836299264886963E-06}; - constexpr FLT c5[] = {2.7801202330030064E-06, 5.8401836435976300E-05, -5.7255962675850168E-05, -4.1058481683291448E-04, 7.4543249761827859E-04, 6.7099534430837577E-17, -7.4543249761823186E-04, 4.1058481683291448E-04, 5.7255962675853089E-05, -5.8401836435976178E-05, -2.7801202330029924E-06}; - constexpr FLT c6[] = {1.1248609988572041E-06, 7.1593996360419040E-06, -3.7923443960739119E-05, 2.8219312687371359E-05, 8.5797383067823588E-05, -1.6875309167105302E-04, 8.5797383067779691E-05, 2.8219312687392853E-05, -3.7923443960740034E-05, 7.1593996360418057E-06, 1.1248609988571978E-06}; - constexpr FLT c7[] = {3.1074712008817516E-07, -3.7942806006679305E-07, -4.2327710785708026E-06, 1.4518421536643064E-05, -1.6373413879605298E-05, 3.0222646636983358E-17, 1.6373413879621934E-05, -1.4518421536591986E-05, 4.2327710785753580E-06, 3.7942806006705484E-07, -3.1074712008817235E-07}; - constexpr FLT c8[] = {5.3160526822194444E-08, -2.9438470061321741E-07, 4.4816653817789122E-07, 4.9835853873945607E-07, -2.6602444110833864E-06, 3.9090815375281113E-06, -2.6602444110225165E-06, 4.9835853874269618E-07, 4.4816653818193273E-07, -2.9438470061323123E-07, 5.3160526822193583E-08}; - constexpr FLT c9[] = {3.1778958300854393E-09, -3.9044067083483707E-08, 1.4726158788365547E-07, -2.7451209287062293E-07, 2.4544112217999958E-07, 8.6199548859978872E-18, -2.4544112207758621E-07, 2.7451209285678326E-07, -1.4726158788296347E-07, 3.9044067083624268E-08, -3.1778958300829052E-09}; - constexpr FLT c10[] = {-8.6163117991617490E-10, 1.2292710054271969E-09, 4.9928263052430922E-09, -2.5746199362556884E-08, 5.5054682151312924E-08, -6.9606951358406722E-08, 5.5054682230504105E-08, -2.5746199365699604E-08, 4.9928263093284604E-09, 1.2292710054468060E-09, -8.6163117991862728E-10}; - constexpr FLT c11[] = {-2.3293080872726303E-10, 9.3461130390718653E-10, -2.2220140857286656E-09, 3.2420144232604506E-09, -2.5573586459741160E-09, -3.4362247560151687E-17, 2.5573586170134590E-09, -3.2420144222311963E-09, 2.2220140843090244E-09, -9.3461130382733279E-10, 2.3293080872885788E-10}; - constexpr FLT c12[] = {-1.6776727231079557E-11, 7.5440974150049303E-11, -2.3911386677196792E-10, 5.3207180787495740E-10, -8.5057641018270776E-10, 9.9272876082686339E-10, -8.5057644693357476E-10, 5.3207181195839291E-10, -2.3911386485786361E-10, 7.5440974126123504E-11, -1.6776727231328710E-11}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - constexpr FLT c0[] = {6.3667715563015689E-08, 8.7461142088576888E-05, 5.6146669497086589E-03, 8.1271316412301370E-02, 4.1627261402765736E-01, 9.0846375182673755E-01, 9.0846375182673755E-01, 4.1627261402765736E-01, 8.1271316412301550E-02, 5.6146669497086719E-03, 8.7461142088576929E-05, 6.3667715563034801E-08}; - constexpr FLT c1[] = {3.3587389488258588E-07, 2.2809471090022899E-04, 9.2744480587562007E-03, 8.5676487647659991E-02, 2.4720659158040625E-01, 1.7472997738462001E-01, -1.7472997738461990E-01, -2.4720659158040617E-01, -8.5676487647660143E-02, -9.2744480587562180E-03, -2.2809471090022899E-04, -3.3587389488256608E-07}; - constexpr FLT c2[] = {7.9035220764954472E-07, 2.6846594761214740E-04, 6.6557324960729147E-03, 3.4792641812076718E-02, 2.9454899103693762E-02, -7.1172529707069221E-02, -7.1172529707069207E-02, 2.9454899103693671E-02, 3.4792641812076690E-02, 6.6557324960729242E-03, 2.6846594761214740E-04, 7.9035220764956886E-07}; - constexpr FLT c3[] = {1.0993606197695965E-06, 1.8716155179384050E-04, 2.6329045000561364E-03, 5.3754303637600113E-03, -1.0591878410592502E-02, -1.5228395084945664E-02, 1.5228395084945803E-02, 1.0591878410592646E-02, -5.3754303637599376E-03, -2.6329045000561364E-03, -1.8716155179384044E-04, -1.0993606197695836E-06}; - constexpr FLT c4[] = {1.0091198513153346E-06, 8.4812954286468477E-05, 5.7431140218944460E-04, -5.0274672420766203E-04, -2.8008958990917627E-03, 2.6435090762445433E-03, 2.6435090762445819E-03, -2.8008958990918187E-03, -5.0274672420767580E-04, 5.7431140218944276E-04, 8.4812954286468423E-05, 1.0091198513153598E-06}; - constexpr FLT c5[] = {6.4507244019416584E-07, 2.5481132674301279E-05, 4.2795619387511420E-05, -3.0197159708156643E-04, 1.1080610219049720E-04, 6.4144454802694492E-04, -6.4144454802681275E-04, -1.1080610219045053E-04, 3.0197159708157808E-04, -4.2795619387511908E-05, -2.5481132674301286E-05, -6.4507244019414964E-07}; - constexpr FLT c6[] = {2.9426545129495891E-07, 4.7724106401925034E-06, -1.1001642128368358E-05, -2.6869692251292103E-05, 9.4483235217708846E-05, -6.1678458203322752E-05, -6.1678458203283029E-05, 9.4483235217638725E-05, -2.6869692251319154E-05, -1.1001642128368348E-05, 4.7724106401924525E-06, 2.9426545129497845E-07}; - constexpr FLT c7[] = {9.5799843879057487E-08, 3.7784160107136394E-07, -3.2256313018476217E-06, 5.0144058082843800E-06, 3.4886031174309006E-06, -1.7411974954245794E-05, 1.7411974954244114E-05, -3.4886031173677615E-06, -5.0144058082412084E-06, 3.2256313018490718E-06, -3.7784160107127161E-07, -9.5799843879039593E-08}; - constexpr FLT c8[] = {2.1473864761677802E-08, -5.7414008446850441E-08, -2.0134799316446491E-07, 1.1145247706131597E-06, -1.8840465966107854E-06, 1.0067804561094662E-06, 1.0067804560969447E-06, -1.8840465965985945E-06, 1.1145247706194121E-06, -2.0134799316567892E-07, -5.7414008446903526E-08, 2.1473864761695718E-08}; - constexpr FLT c9[] = {2.8867786924320735E-09, -2.0015791402048098E-08, 4.5306507660172584E-08, -7.8859059608423767E-09, -1.5755151471717741E-07, 3.4270221893522085E-07, -3.4270221891584534E-07, 1.5755151474485673E-07, 7.8859059608423767E-09, -4.5306507656885666E-08, 2.0015791402102159E-08, -2.8867786924173336E-09}; - constexpr FLT c10[] = {6.9986758892026879E-11, -1.8486004428526375E-09, 8.7658205612213605E-09, -2.0364661368255434E-08, 2.5396405431717686E-08, -1.2044441164754235E-08, -1.2044441145898965E-08, 2.5396405393379069E-08, -2.0364661337458944E-08, 8.7658205594930229E-09, -1.8486004428624741E-09, 6.9986758906941889E-11}; - constexpr FLT c11[] = {-5.6296594747629561E-11, 1.4066781276164117E-10, 4.6947620156299098E-11, -1.1526063766721083E-09, 3.3027593515457814E-09, -5.2174001597719162E-09, 5.2174001336505757E-09, -3.3027593563725673E-09, 1.1526063504088099E-09, -4.6947618665684182E-11, -1.4066781273945818E-10, 5.6296594761077256E-11}; - constexpr FLT c12[] = {-1.0870401168253040E-11, 4.8044744351982426E-11, -1.3004175788815863E-10, 2.2570502267192305E-10, -2.4006684875388499E-10, 1.0598000131166063E-10, 1.0597991964307358E-10, -2.4006682833673746E-10, 2.2570504206821193E-10, -1.3004176149306233E-10, 4.8044744304130286E-11, -1.0870401156071839E-11}; - constexpr FLT c13[] = {-4.7539080498592749E-13, 2.6787995976616703E-12, -1.0000145739993567E-11, 2.5777400861531429E-11, -4.7463672955972831E-11, 6.4012227921839136E-11, -6.4012266007267373E-11, 4.7463669782187146E-11, -2.5777397687745743E-11, 1.0000149112140858E-11, -2.6787995744161696E-12, 4.7539081133001201E-13}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - constexpr FLT c0[] = {1.0208956054983696E-08, 2.0506572462261995E-05, 1.7784497194617906E-03, 3.4214490279693019E-02, 2.3443634373410047E-01, 7.0049708882252804E-01, 9.9999999999999956E-01, 7.0049708882252670E-01, 2.3443634373410041E-01, 3.4214490279692922E-02, 1.7784497194617906E-03, 2.0506572462261785E-05, 1.0208956054983676E-08}; - constexpr FLT c1[] = {5.6353468219321995E-08, 5.6780128053894686E-05, 3.1934841481628326E-03, 4.0941461360716927E-02, 1.7436810648693357E-01, 2.5085467225681696E-01, -6.3638764007737755E-17, -2.5085467225681662E-01, -1.7436810648693341E-01, -4.0941461360716816E-02, -3.1934841481628326E-03, -5.6780128053894232E-05, -5.6353468219321988E-08}; - constexpr FLT c2[] = {1.3966266158866427E-07, 7.1655019336418755E-05, 2.5459504018621182E-03, 2.0160236969440644E-02, 4.0770064165298429E-02, -1.9317276988534509E-02, -8.8461538461538661E-02, -1.9317276988534381E-02, 4.0770064165298395E-02, 2.0160236969440602E-02, 2.5459504018621160E-03, 7.1655019336418200E-05, 1.3966266158866422E-07}; - constexpr FLT c3[] = {2.0618605552701903E-07, 5.4306747658367697E-05, 1.1637911071900936E-03, 4.7784706844645319E-03, -1.2004184173788884E-03, -1.6862510515565966E-02, 1.4394808111083350E-16, 1.6862510515566146E-02, 1.2004184173788636E-03, -4.7784706844645379E-03, -1.1637911071900920E-03, -5.4306747658367331E-05, -2.0618605552701909E-07}; - constexpr FLT c4[] = {2.0277547837406105E-07, 2.7328509487415503E-05, 3.2236608098850310E-04, 3.0859705461356495E-04, -2.0254394973524947E-03, -5.2398574644553877E-04, 3.7818616294949463E-03, -5.2398574644547762E-04, -2.0254394973524895E-03, 3.0859705461357378E-04, 3.2236608098850327E-04, 2.7328509487415384E-05, 2.0277547837406108E-07}; - constexpr FLT c5[] = {1.4058372037094490E-07, 9.4685595066536085E-06, 4.8682874512158502E-05, -1.1575111217134651E-04, -2.1811605515759046E-04, 5.4056763477041119E-04, 1.1213866287069097E-16, -5.4056763477029453E-04, 2.1811605515769156E-04, 1.1575111217135234E-04, -4.8682874512158861E-05, -9.4685595066535949E-06, -1.4058372037094498E-07}; - constexpr FLT c6[] = {7.0755520230584385E-08, 2.2298625886400277E-06, 7.8375383352022143E-07, -2.8394470622676381E-05, 3.5771256766257562E-05, 4.1631950912211130E-05, -1.0418619302467684E-04, 4.1631950912333557E-05, 3.5771256766183768E-05, -2.8394470622671916E-05, 7.8375383351933331E-07, 2.2298625886400294E-06, 7.0755520230584346E-08}; - constexpr FLT c7[] = {2.6111186487625245E-08, 3.2044561720738826E-07, -1.2220373462313589E-06, -8.5793794342228941E-07, 8.3299507234112700E-06, -1.0956754351178954E-05, 9.4610283796409485E-17, 1.0956754351115859E-05, -8.3299507234215327E-06, 8.5793794342144989E-07, 1.2220373462321896E-06, -3.2044561720741346E-07, -2.6111186487625302E-08}; - constexpr FLT c8[] = {6.9838095920570498E-09, 1.2796250155222958E-08, -2.1971713837900942E-07, 5.2791981730307194E-07, -1.4622692107334488E-07, -1.2222183756556175E-06, 2.0809248310569844E-06, -1.2222183756925741E-06, -1.4622692099063203E-07, 5.2791981730006307E-07, -2.1971713837856465E-07, 1.2796250155283016E-08, 6.9838095920570937E-09}; - constexpr FLT c9[] = {1.2845897306280646E-09, -5.2304801922802769E-09, -5.0548716982175665E-09, 6.7539942924545603E-08, -1.6027276234256162E-07, 1.5655092165632365E-07, 4.6828140259346451E-17, -1.5655092173659360E-07, 1.6027276234809749E-07, -6.7539942912781904E-08, 5.0548716984338105E-09, 5.2304801922379145E-09, -1.2845897306280857E-09}; - constexpr FLT c10[] = {1.3345700642131601E-10, -1.1551704392349950E-09, 3.4412362345673782E-09, -3.2850871078054311E-09, -6.1855158542452699E-09, 2.3119925642302808E-08, -3.2145944181567604E-08, 2.3119926027259106E-08, -6.1855159240088862E-09, -3.2850871247748739E-09, 3.4412362345280933E-09, -1.1551704391858975E-09, 1.3345700642134581E-10}; - constexpr FLT c11[] = {-1.9694481417663767E-12, -7.0630732018717419E-11, 4.4161967766895751E-10, -1.2581280884757252E-09, 2.0087583285653241E-09, -1.6557203488425082E-09, 5.7014219382328511E-17, 1.6557200410648860E-09, -2.0087583339599462E-09, 1.2581281082796833E-09, -4.4161967789965090E-10, 7.0630731978790794E-11, 1.9694481417229703E-12}; - constexpr FLT c12[] = {-3.1122514901291979E-12, 1.0235548893351873E-11, -1.0076717787418374E-11, -3.6278872085836478E-11, 1.6235812713334426E-10, -3.2356766327511469E-10, 4.0014573853281197E-10, -3.2356772044312440E-10, 1.6235817511363862E-10, -3.6278891226911122E-11, -1.0076717627909611E-11, 1.0235548938213992E-11, -3.1122514900941893E-12}; - constexpr FLT c13[] = {-4.4521627553052389E-13, 2.1830423195977186E-12, -6.6494700502871459E-12, 1.3364548102385267E-11, -1.7572530897780217E-11, 1.3087527392509343E-11, -1.4854086432767967E-17, -1.3087613084722882E-11, 1.7572508681280409E-11, -1.3364552466340585E-11, 6.6494701742631489E-12, -2.1830423513665695E-12, 4.4521627553052389E-13}; - constexpr FLT c14[] = {-1.1331825591762625E-14, 7.5442537823437382E-14, -3.5473113067901070E-13, 1.0827924393926043E-12, -2.3053993601726267E-12, 3.5752731472827676E-12, -4.1288118242378826E-12, 3.5755029357484062E-12, -2.3054273074184593E-12, 1.0827837446939142E-12, -3.5473109186339628E-13, 7.5442574213081941E-14, -1.1331825564518091E-14}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.6070755785071491E-09, 4.6371263117318300E-06, 5.3392892770691468E-04, 1.3380163586766329E-02, 1.1960061568997656E-01, 4.7332499268789285E-01, 9.2104360429933863E-01, 9.2104360429933885E-01, 4.7332499268789302E-01, 1.1960061568997683E-01, 1.3380163586766332E-02, 5.3392892770691837E-04, 4.6371263117318342E-06, 1.6070755785075502E-09}; - constexpr FLT c1[] = {9.2475302076758674E-09, 1.3546865389183953E-05, 1.0306349751547578E-03, 1.7767594411827761E-02, 1.0518000824290019E-01, 2.3882936521395404E-01, 1.5170179567585843E-01, -1.5170179567585837E-01, -2.3882936521395398E-01, -1.0518000824290036E-01, -1.7767594411827754E-02, -1.0306349751547613E-03, -1.3546865389183977E-05, -9.2475302076757731E-09}; - constexpr FLT c2[] = {2.4024402573674993E-08, 1.8178651135370012E-05, 8.9712289901830596E-04, 1.0104692380253478E-02, 3.4193348251104483E-02, 1.8533380680638794E-02, -6.3746746886473832E-02, -6.3746746886473860E-02, 1.8533380680638745E-02, 3.4193348251104413E-02, 1.0104692380253471E-02, 8.9712289901830889E-04, 1.8178651135370046E-05, 2.4024402573675768E-08}; - constexpr FLT c3[] = {3.7419288907183495E-08, 1.4804264337309617E-05, 4.5929141335173144E-04, 3.0552592910038168E-03, 3.3079403387824323E-03, -1.0247716289024879E-02, -1.1480323948535117E-02, 1.1480323948535463E-02, 1.0247716289025027E-02, -3.3079403387824271E-03, -3.0552592910038120E-03, -4.5929141335173334E-04, -1.4804264337309643E-05, -3.7419288907183766E-08}; - constexpr FLT c4[] = {3.9124194363163287E-08, 8.1265227753122953E-06, 1.4975407030324905E-04, 4.4789439277602894E-04, -7.9407521150521383E-04, -1.9254008995687184E-03, 2.1136619999320748E-03, 2.1136619999320141E-03, -1.9254008995687132E-03, -7.9407521150514292E-04, 4.4789439277602867E-04, 1.4975407030325005E-04, 8.1265227753123105E-06, 3.9124194363164148E-08}; - constexpr FLT c5[] = {2.9113992252245385E-08, 3.1458937074171823E-06, 3.0585266291431613E-05, -6.5135387342551234E-06, -2.3196510408355524E-04, 1.5778347828067563E-04, 4.2181913759748168E-04, -4.2181913759742725E-04, -1.5778347828060562E-04, 2.3196510408355524E-04, 6.5135387342551234E-06, -3.0585266291432040E-05, -3.1458937074171887E-06, -2.9113992252245408E-08}; - constexpr FLT c6[] = {1.5927753226313472E-08, 8.6591441391883797E-07, 3.1186030532599549E-06, -1.4256326863802477E-05, -6.9192418278078229E-06, 6.1786486497582421E-05, -4.4611361914704291E-05, -4.4611361914610670E-05, 6.1786486497541994E-05, -6.9192418278024798E-06, -1.4256326863804276E-05, 3.1186030532598494E-06, 8.6591441391883161E-07, 1.5927753226313945E-08}; - constexpr FLT c7[] = {6.5072355972925020E-09, 1.6321871905299654E-07, -1.6208737249918160E-07, -2.0005919851675986E-06, 4.6289117401651821E-06, 1.5738407907104777E-07, -1.0033756087313552E-05, 1.0033756087535249E-05, -1.5738407898383816E-07, -4.6289117402341052E-06, 2.0005919851709152E-06, 1.6208737249923451E-07, -1.6321871905299225E-07, -6.5072355972922787E-09}; - constexpr FLT c8[] = {1.9857214221989366E-09, 1.7788899565181922E-08, -1.0133541198312604E-07, 4.4566342395340293E-08, 5.3564828266574526E-07, -1.1695093255338883E-06, 6.7085595118984104E-07, 6.7085595114069746E-07, -1.1695093255217181E-06, 5.3564828276835377E-07, 4.4566342396873204E-08, -1.0133541198326502E-07, 1.7788899565180526E-08, 1.9857214221992563E-09}; - constexpr FLT c9[] = {4.4289508956510332E-10, -2.3397558741938982E-11, -1.2203541602658680E-08, 4.1555456455006879E-08, -4.0387396856849884E-08, -5.2822132653130956E-08, 1.7383889351097292E-07, -1.7383889353173241E-07, 5.2822132672506464E-08, 4.0387396834706444E-08, -4.1555456455698865E-08, 1.2203541602950610E-08, 2.3397558742361335E-11, -4.4289508956485253E-10}; - constexpr FLT c10[] = {6.7195187479843226E-11, -3.6781600571171619E-10, 1.8909214083296717E-10, 3.2074788122994124E-09, -1.0777792237807384E-08, 1.5287295377979802E-08, -7.6060392723093131E-09, -7.6060391755201933E-09, 1.5287295398091755E-08, -1.0777792217695420E-08, 3.2074788146563205E-09, 1.8909214044014493E-10, -3.6781600571662634E-10, 6.7195187480068943E-11}; - constexpr FLT c11[] = {5.1753158905822061E-12, -5.7459004384753609E-11, 2.1373772914288248E-10, -3.3474981614755248E-10, -5.5056523013581392E-11, 1.1984997345151211E-09, -2.3401534609898206E-09, 2.3401534737665714E-09, -1.1984997515507915E-09, 5.5056487167718091E-11, 3.3474981678638774E-10, -2.1373772871699109E-10, 5.7459004393903842E-11, -5.1753158903480283E-12}; - constexpr FLT c12[] = {-3.4295334316135217E-13, -1.9669734020395281E-12, 1.8829710516667924E-11, -6.6063898621267923E-11, 1.2987243021035191E-10, -1.4723142988261286E-10, 6.6816662742079877E-11, 6.6816650491789053E-11, -1.4723143192432656E-10, 1.2987247614892944E-10, -6.6063898621269021E-11, 1.8829709886607818E-11, -1.9669734162457477E-12, -3.4295334295692199E-13}; - constexpr FLT c13[] = {-1.4925032356367256E-13, 5.9101412900182951E-13, -1.0473414103260276E-12, -3.4168877521962931E-13, 6.3681343308181771E-12, -1.6773485918159645E-11, 2.5499676364679485E-11, -2.5499722384571941E-11, 1.6773473223016897E-11, -6.3681501997466111E-12, 3.4168877521962931E-13, 1.0473414909104298E-12, -5.9101412551500433E-13, 1.4925032367414924E-13}; - constexpr FLT c14[] = {-1.6512890188764807E-14, 8.8250735109913167E-14, -3.0062084749515021E-13, 6.8819378623923325E-13, -1.0710378278007934E-12, 1.0658930503703208E-12, -4.5535006559156473E-13, -4.5529417109990688E-13, 1.0659116818675222E-12, -1.0710247857527394E-12, 6.8819549412647750E-13, -3.0062091542248455E-13, 8.8250729803090660E-14, -1.6512890092223385E-14}; - constexpr FLT c15[] = {1.6573977440105294E-16, 1.3350735743743382E-15, -1.0198606577404851E-14, 3.9099634678793536E-14, -9.7801981044810947E-14, 1.7461338478760738E-13, -2.3137912816883565E-13, 2.3133990246879147E-13, -1.7463221312362809E-13, 9.7795403196649327E-14, -3.9099513984331611E-14, 1.0198764988885690E-14, -1.3350660309704511E-15, -1.6573967886539614E-16}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.4886236238313534E-10, 1.0156314710024854E-06, 1.5297772142853732E-04, 4.9110296377727252E-03, 5.6121982134094042E-02, 2.8670951404936740E-01, 7.3488453954210731E-01, 1.0000000000000018E+00, 7.3488453954210708E-01, 2.8670951404936784E-01, 5.6121982134094188E-02, 4.9110296377727321E-03, 1.5297772142853737E-04, 1.0156314710024854E-06, 2.4886236238313394E-10}; - constexpr FLT c1[] = {1.4880454274285384E-09, 3.1146031777409673E-06, 3.1470309742465694E-04, 7.1215977556942766E-03, 5.6335374470954679E-02, 1.8245542837228418E-01, 2.2739494478010200E-01, -4.2425842671825266E-17, -2.2739494478010208E-01, -1.8245542837228432E-01, -5.6335374470954783E-02, -7.1215977556942861E-03, -3.1470309742465694E-04, -3.1146031777409668E-06, -1.4880454274285366E-09}; - constexpr FLT c2[] = {4.0364738474324423E-09, 4.4152383936309442E-06, 2.9537757977456596E-04, 4.5415629108243238E-03, 2.2685962261788550E-02, 3.3502333548319392E-02, -2.2696322242195994E-02, -7.6666666666667133E-02, -2.2696322242195945E-02, 3.3502333548319260E-02, 2.2685962261788570E-02, 4.5415629108243273E-03, 2.9537757977456591E-04, 4.4152383936309416E-06, 4.0364738474324407E-09}; - constexpr FLT c3[] = {6.6006259688120961E-09, 3.8297656275654657E-06, 1.6597029248061439E-04, 1.6248331197066942E-03, 4.0281119347581979E-03, -2.8399908290139206E-03, -1.3756562885831705E-02, 1.0758125681708418E-16, 1.3756562885831904E-02, 2.8399908290139895E-03, -4.0281119347581771E-03, -1.6248331197066914E-03, -1.6597029248061437E-04, -3.8297656275654657E-06, -6.6006259688120969E-09}; - constexpr FLT c4[] = {7.2920076887968825E-09, 2.2644150332986910E-06, 6.1226481435400985E-05, 3.3216368068303816E-04, 4.2258807580024870E-07, -1.7026747228854500E-03, -1.2026158633582243E-04, 2.8537037037044089E-03, -1.2026158633584264E-04, -1.7026747228853732E-03, 4.2258807580182180E-07, 3.3216368068303642E-04, 6.1226481435401053E-05, 2.2644150332986919E-06, 7.2920076887968842E-09}; - constexpr FLT c5[] = {5.7777535593445574E-09, 9.5996306286140537E-07, 1.5097159537535560E-05, 2.8094504791464212E-05, -1.2791075475386364E-04, -1.0516749004210079E-04, 4.0040320377530828E-04, 5.4844446833709888E-17, -4.0040320377525385E-04, 1.0516749004229523E-04, 1.2791075475386559E-04, -2.8094504791467126E-05, -1.5097159537535560E-05, -9.5996306286140579E-07, -5.7777535593445582E-09}; - constexpr FLT c6[] = {3.3986627004323950E-09, 2.9741452947022275E-07, 2.3232144780590118E-06, -3.5941523174497321E-06, -1.8171775676701533E-05, 3.2858338560981214E-05, 2.0665249075258455E-05, -6.8763374485615104E-05, 2.0665249075221676E-05, 3.2858338560934424E-05, -1.8171775676683576E-05, -3.5941523174470280E-06, 2.3232144780590435E-06, 2.9741452947022206E-07, 3.3986627004323950E-09}; - constexpr FLT c7[] = {1.5128957992049987E-09, 6.6672685257784247E-08, 1.4160936684823307E-07, -1.2611166225385906E-06, 6.6865545481897967E-07, 4.6861078169740899E-06, -7.4523870622442393E-06, 5.1688954219266444E-17, 7.4523870623463821E-06, -4.6861078171739939E-06, -6.6865545481690963E-07, 1.2611166225370325E-06, -1.4160936684824530E-07, -6.6672685257784551E-08, -1.5128957992049987E-09}; - constexpr FLT c8[] = {5.1310324414219292E-10, 1.0163871982745590E-08, -2.4441175134592830E-08, -1.0543632600171378E-07, 4.0979777876715675E-07, -2.9573937051194202E-07, -5.9824625884543558E-07, 1.2067769776847866E-06, -5.9824625879665336E-07, -2.9573937049659643E-07, 4.0979777875267863E-07, -1.0543632599876183E-07, -2.4441175134530762E-08, 1.0163871982746284E-08, 5.1310324414219364E-10}; - constexpr FLT c9[] = {1.3160883866734095E-10, 8.0584478671564817E-10, -6.7824252838686685E-09, 9.4471403089230076E-09, 2.4030590211824177E-08, -9.0522548480936782E-08, 9.9320303339648267E-08, 1.4827374781995408E-17, -9.9320303311968964E-08, 9.0522548602725694E-08, -2.4030590184836860E-08, -9.4471403124694187E-09, 6.7824252839146209E-09, -8.0584478671585931E-10, -1.3160883866734196E-10}; - constexpr FLT c10[] = {2.4734066313995269E-11, -4.3978001545632529E-11, -5.4975091406435660E-10, 2.6307942070348926E-09, -4.2001676281559915E-09, -1.8212709350780177E-10, 1.0547608795803518E-08, -1.6454374555673015E-08, 1.0547608746152108E-08, -1.8212708345187657E-10, -4.2001676312984721E-09, 2.6307942087632753E-09, -5.4975091402508072E-10, -4.3978001545363347E-11, 2.4734066313995970E-11}; - constexpr FLT c11[] = {3.0917581107111067E-12, -2.1504981481527399E-11, 3.4611945838654282E-11, 1.1082666500276105E-10, -5.8883840899000033E-10, 1.1304779661881485E-09, -1.0037911406820197E-09, -5.7884986037117854E-17, 1.0037911398302301E-09, -1.1304781086488634E-09, 5.8883842723235649E-10, -1.1082666592552764E-10, -3.4611945887454015E-11, 2.1504981480972878E-11, -3.0917581107111891E-12}; - constexpr FLT c12[] = {1.5997634038655269E-13, -2.4807970173617968E-12, 1.1275106610326804E-11, -2.3847055813595321E-11, 1.5364454138408298E-11, 4.4350534757580891E-11, -1.3563510404683277E-10, 1.8159081432580251E-10, -1.3563508771311925E-10, 4.4350484735577755E-11, 1.5364420705333068E-11, -2.3847054665131313E-11, 1.1275106670142851E-11, -2.4807970168633410E-12, 1.5997634038739785E-13}; - constexpr FLT c13[] = {-2.4800914618527656E-14, -2.0428592368367617E-14, 6.6720756177865110E-13, -2.9781122281459938E-12, 7.0947566948544657E-12, -1.0181675867287212E-11, 7.9189142537208719E-12, -1.4497056804736912E-17, -7.9189459915777383E-12, 1.0181666345930152E-11, -7.0947487603902491E-12, 2.9781098973971301E-12, -6.6720754938105074E-13, 2.0428592180708626E-14, 2.4800914617770965E-14}; - constexpr FLT c14[] = {-6.3774103672726629E-15, 2.8974955370030088E-14, -6.8422346755457550E-14, 5.3399811794037740E-14, 1.7893441503609519E-13, -7.2418549150581294E-13, 1.3713697997539906E-12, -1.6687145216540105E-12, 1.3713520998316439E-12, -7.2416872315832831E-13, 1.7893006768675052E-13, 5.3400626922038687E-14, -6.8422339477528482E-14, 2.8974955559559462E-14, -6.3774103666804019E-15}; - constexpr FLT c15[] = {-5.1635500202709335E-16, 3.1828105471276549E-15, -1.2111383721117860E-14, 3.1272734620510859E-14, -5.6176935449952714E-14, 6.8640388687474512E-14, -4.9039125333789703E-14, -3.5058680377244798E-17, 4.9029469776856299E-14, -6.8666790600965935E-14, 5.6189548021197700E-14, -3.1272749707318549E-14, 1.2111366748459164E-14, -3.1828106649933298E-15, 5.1635500199831522E-16}; - constexpr FLT c16[] = {4.5179133600663468E-18, -1.3721818586136237E-17, -2.0190809683029299E-16, 1.1787611877454253E-15, -3.5963787346199218E-15, 7.4622525856292898E-15, -1.1451676136812928E-14, 1.2941737777564503E-14, -1.1457648327763603E-14, 7.4174611535501039E-15, -3.6182145577673462E-15, 1.1783995902489914E-15, -2.0188185185104562E-16, -1.3721704675617759E-17, 4.5179136270619547E-18}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {3.7973138383475505E-11, 2.1620729770457867E-07, 4.2059935922517660E-05, 1.7055631615451750E-03, 2.4507833223051390E-02, 1.5833750021928361E-01, 5.2065761855025572E-01, 9.3058177132107800E-01, 9.3058177132107822E-01, 5.2065761855025583E-01, 1.5833750021928361E-01, 2.4507833223051407E-02, 1.7055631615451757E-03, 4.2059935922517680E-05, 2.1620729770457854E-07, 3.7973138383475363E-11}; - constexpr FLT c1[] = {2.3529614069937368E-10, 6.9307767643753084E-07, 9.1584555859393273E-05, 2.6688190455647263E-03, 2.7424935799146805E-02, 1.1980519064171602E-01, 2.2858769149343988E-01, 1.3403316930972969E-01, -1.3403316930972969E-01, -2.2858769149343988E-01, -1.1980519064171603E-01, -2.7424935799146809E-02, -2.6688190455647263E-03, -9.1584555859393273E-05, -6.9307767643753063E-07, -2.3529614069937291E-10}; - constexpr FLT c2[] = {6.6422278409342484E-10, 1.0324321112746625E-06, 9.1817488865684769E-05, 1.8711533829047168E-03, 1.2921996060610234E-02, 3.2047854205940321E-02, 1.0693035516337747E-02, -5.7626889750985358E-02, -5.7626889750985420E-02, 1.0693035516337622E-02, 3.2047854205940300E-02, 1.2921996060610227E-02, 1.8711533829047159E-03, 9.1817488865684728E-05, 1.0324321112746625E-06, 6.6422278409342453E-10}; - constexpr FLT c3[] = {1.1357078950958115E-09, 9.4728532805183455E-07, 5.5827161828283907E-05, 7.6087086075588353E-04, 3.0946204357507638E-03, 1.6729582927767952E-03, -9.5127691406672668E-03, -8.9630953638633881E-03, 8.9630953638635737E-03, 9.5127691406674039E-03, -1.6729582927767412E-03, -3.0946204357507521E-03, -7.6087086075588267E-04, -5.5827161828283886E-05, -9.4728532805183402E-07, -1.1357078950958119E-09}; - constexpr FLT c4[] = {1.3190161602522571E-09, 5.9764321317063336E-07, 2.2744388605472980E-05, 1.9073517322668089E-04, 2.8943142766413201E-04, -8.8625893129445465E-04, -1.3389167739520302E-03, 1.7216657535080475E-03, 1.7216657535079566E-03, -1.3389167739519974E-03, -8.8625893129445302E-04, 2.8943142766413342E-04, 1.9073517322668089E-04, 2.2744388605472997E-05, 5.9764321317063368E-07, 1.3190161602522571E-09}; - constexpr FLT c5[] = {1.1057322032863292E-09, 2.7364351668058875E-07, 6.4277990516969732E-06, 2.7144256967440253E-05, -3.6927862875708149E-05, -1.6756539822663250E-04, 1.6190404775924360E-04, 2.9203183363577429E-04, -2.9203183363574707E-04, -1.6190404775915027E-04, 1.6756539822663250E-04, 3.6927862875712038E-05, -2.7144256967440009E-05, -6.4277990516969918E-06, -2.7364351668058875E-07, -1.1057322032863296E-09}; - constexpr FLT c6[] = {6.9354916180818945E-10, 9.3269475195063855E-08, 1.2384428187212403E-06, 8.4996778392803041E-07, -1.3106613626284104E-05, 2.8218026704026646E-06, 4.1119875273776001E-05, -3.3017437945353985E-05, -3.3017437945415066E-05, 4.1119875273714446E-05, 2.8218026703990287E-06, -1.3106613626289508E-05, 8.4996778392747454E-07, 1.2384428187212240E-06, 9.3269475195063643E-08, 6.9354916180818914E-10}; - constexpr FLT c7[] = {3.3254260763956042E-10, 2.3748169129617104E-08, 1.4324995919586480E-07, -4.5855119979446571E-07, -9.5896649524100645E-07, 3.6155491755001142E-06, -9.8206137491315186E-07, -6.1812989819835450E-06, 6.1812989820611756E-06, 9.8206137497544330E-07, -3.6155491754721922E-06, 9.5896649524660746E-07, 4.5855119979503682E-07, -1.4324995919584492E-07, -2.3748169129616922E-08, -3.3254260763956068E-10}; - constexpr FLT c8[] = {1.2320735888479529E-10, 4.4066719437554910E-09, 2.9936173156462927E-09, -8.7082338359679101E-08, 1.2972939456291547E-07, 2.2882425903046301E-07, -7.3491924909334631E-07, 4.5592445674903059E-07, 4.5592445658978770E-07, -7.3491924903833956E-07, 2.2882425902441689E-07, 1.2972939456293178E-07, -8.7082338359266715E-08, 2.9936173156449473E-09, 4.4066719437557416E-09, 1.2320735888479524E-10}; - constexpr FLT c9[] = {3.5284250010876628E-11, 5.4380355945640250E-10, -2.1550460241694361E-09, -3.7344953348928088E-09, 2.7722604311846508E-08, -3.9597167021230792E-08, -1.3993916628542531E-08, 9.5626629210101709E-08, -9.5626629290371673E-08, 1.3993916670061478E-08, 3.9597167019846826E-08, -2.7722604310808535E-08, 3.7344953348928088E-09, 2.1550460241924123E-09, -5.4380355945618072E-10, -3.5284250010876789E-11}; - constexpr FLT c10[] = {7.7013760205813290E-12, 2.8123297626332877E-11, -3.7953802132437611E-10, 8.7573780453214681E-10, 5.1359846908750478E-10, -5.3609157480923598E-09, 9.1303305149265196E-09, -4.8150450778386211E-09, -4.8150450602405480E-09, 9.1303305006281353E-09, -5.3609157342653948E-09, 5.1359846657352753E-10, 8.7573780480711250E-10, -3.7953802133297068E-10, 2.8123297626237416E-11, 7.7013760205811319E-12}; - constexpr FLT c11[] = {1.2276300481459368E-12, -4.1769601372671798E-12, -1.9148402800715177E-11, 1.3822953630779855E-10, -3.0994364017547768E-10, 2.0316700893505159E-10, 4.3650568116859601E-10, -1.1534087567294806E-09, 1.1534086455717957E-09, -4.3650568244627625E-10, -2.0316701046115955E-10, 3.0994364003351358E-10, -1.3822953650299937E-10, 1.9148402794060861E-11, 4.1769601372325045E-12, -1.2276300481460517E-12}; - constexpr FLT c12[] = {1.2527329159215257E-13, -1.0816725479918068E-12, 2.7445378707133412E-12, 1.7839886378835549E-12, -2.6194655703148228E-11, 6.7446666417949068E-11, -8.5082142817277568E-11, 4.0255080062661886E-11, 4.0254965726647763E-11, -8.5082126483561454E-11, 6.7446671522236455E-11, -2.6194657362041918E-11, 1.7839889409505645E-12, 2.7445378607441180E-12, -1.0816725479139360E-12, 1.2527329159224173E-13}; - constexpr FLT c13[] = {3.2506946752710786E-15, -9.2845381849289691E-14, 5.1542691616877330E-13, -1.3678932005895992E-12, 1.6503397946393055E-12, 7.2548932254614457E-13, -6.2314806405069215E-12, 1.1299375277421538E-11, -1.1299433992456742E-11, 6.2314647715784883E-12, -7.2550201768889120E-13, -1.6503403897241219E-12, 1.3678930766135958E-12, -5.1542690377117294E-13, 9.2845381940092428E-14, -3.2506946753893115E-15}; - constexpr FLT c14[] = {-1.3523251101878356E-15, 1.9055798839533079E-15, 1.8430813184053169E-14, -1.1526987096958319E-13, 3.3349122385594633E-13, -5.8352048227061829E-13, 6.1751861733538967E-13, -2.7104853725824153E-13, -2.7103052681092733E-13, 6.1751644366071028E-13, -5.8351023494715043E-13, 3.3348982649365648E-13, -1.1526961866805939E-13, 1.8430809545089241E-14, 1.9055798650003023E-15, -1.3523251102248507E-15}; - constexpr FLT c15[] = {-2.4132931360656334E-16, 1.2442654599774185E-15, -3.5592598733275504E-15, 5.0956447378324209E-15, 1.6446732556150498E-15, -2.5290498540837812E-14, 6.2712721591286338E-14, -9.2666673089509217E-14, 9.2581824882952367E-14, -6.2712118118977746E-14, 2.5288160085642670E-14, -1.6451258598462044E-15, -5.0958559531403920E-15, 3.5592532728491847E-15, -1.2442654894438389E-15, 2.4132931361645452E-16}; - constexpr FLT c16[] = {-1.6052119916687038E-17, 1.0220930228231101E-16, -4.3668420339021406E-16, 1.2658361982998821E-15, -2.5907177687935505E-15, 3.7311262928168221E-15, -3.4997038937045781E-15, 1.4124231584693148E-15, 1.3706178218468559E-15, -3.5056760846448971E-15, 3.7363519598930578E-15, -2.5923974474980012E-15, 1.2658945204780770E-15, -4.3668985335150679E-16, 1.0220927950027870E-16, -1.6052119872193216E-17}; - constexpr FLT c17[] = {1.2307507877258324E-18, -2.6518352923945508E-18, -1.0105982127470271E-20, 2.6958700270869167E-17, -1.1513299715471039E-16, 2.7882272296911513E-16, -4.6961519239790030E-16, 6.5796739812484873E-16, -6.7025909677113713E-16, 4.6238478142949540E-16, -2.8307058941305305E-16, 1.1494093936336214E-16, -2.6999653770494898E-17, 1.1474040843416029E-20, 2.6518435669432360E-18, -1.2307508200482882E-18}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc index 358a1bdbf..e2fa229b7 100644 --- a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc +++ b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc @@ -2,191 +2,170 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. if (w==2) { - constexpr FLT c0[] = {2.3711015472112535E+01, 2.3711015472112539E+01}; - constexpr FLT c1[] = {2.5079742199350566E+01, -2.5079742199350566E+01}; - constexpr FLT c2[] = {-3.5023281580177019E+00, -3.5023281580177028E+00}; - constexpr FLT c3[] = {-7.3894949249195596E+00, 7.3894949249195649E+00}; + constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; + constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; + constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; + constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); } else if (w==3) { - constexpr FLT c0[] = {5.9620016143346866E+01, 2.4110216701187517E+02, 5.9620016148621886E+01}; - constexpr FLT c1[] = {9.7575520958604287E+01, 6.0625609804989280E-15, -9.7575520952908548E+01}; - constexpr FLT c2[] = {3.5838417859768519E+01, -7.3472145274965385E+01, 3.5838417865129472E+01}; - constexpr FLT c3[] = {-1.0721643298166459E+01, 2.2269719700859066E-14, 1.0721643303220411E+01}; - constexpr FLT c4[] = {-7.0570630207138105E+00, 9.1538553399011651E+00, -7.0570630151506615E+00}; + constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; + constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; + constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; + constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; + constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); } else if (w==4) { - constexpr FLT c0[] = {1.2612470018753703E+02, 1.1896204292999123E+03, 1.1896204292999125E+03, 1.2612470018753706E+02}; - constexpr FLT c1[] = {2.6158034850676631E+02, 5.6161104654809833E+02, -5.6161104654809833E+02, -2.6158034850676631E+02}; - constexpr FLT c2[] = {1.7145379463699527E+02, -1.6695967127766502E+02, -1.6695967127766531E+02, 1.7145379463699518E+02}; - constexpr FLT c3[] = {2.3525961965887934E+01, -1.0057439659768855E+02, 1.0057439659768869E+02, -2.3525961965887870E+01}; - constexpr FLT c4[] = {-1.5608307370340814E+01, 9.5627412100261218E+00, 9.5627412100261768E+00, -1.5608307370340912E+01}; - constexpr FLT c5[] = {-4.5715207776748672E+00, 7.9904373067896399E+00, -7.9904373067894170E+00, 4.5715207776748832E+00}; + constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; + constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; + constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; + constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; + constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; + constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==5) { - constexpr FLT c0[] = {2.4106943677442635E+02, 4.3538384278025578E+03, 9.3397486707382068E+03, 4.3538384278025542E+03, 2.4106943677442635E+02}; - constexpr FLT c1[] = {5.8781364250328284E+02, 3.4742855804122032E+03, -2.2247045611533172E-13, -3.4742855804122019E+03, -5.8781364250328272E+02}; - constexpr FLT c2[] = {5.1234107167555874E+02, 3.5219546517037230E+02, -1.7076861141633149E+03, 3.5219546517037259E+02, 5.1234107167555862E+02}; - constexpr FLT c3[] = {1.7540956907856085E+02, -3.5792356187777011E+02, 1.0950032210404113E-12, 3.5792356187777193E+02, -1.7540956907856062E+02}; - constexpr FLT c4[] = {-2.1768066955080412E-01, -7.8322173187697160E+01, 1.3904039464934533E+02, -7.8322173187696521E+01, -2.1768066955089899E-01}; - constexpr FLT c5[] = {-1.4207955403641282E+01, 1.6019466986222039E+01, 6.2864597222035853E-14, -1.6019466986221275E+01, 1.4207955403641282E+01}; - constexpr FLT c6[] = {-2.1966493586752702E+00, 5.0672636163198259E+00, -6.7340544905090631E+00, 5.0672636163192113E+00, -2.1966493586753031E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; + constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; + constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; + constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; + constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; + constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; + for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); } else if (w==6) { - constexpr FLT c0[] = {4.3011762559089192E+02, 1.3368828836127082E+04, 4.9861340433371268E+04, 4.9861340433371290E+04, 1.3368828836127082E+04, 4.3011762559835182E+02}; - constexpr FLT c1[] = {1.1857225840065146E+03, 1.4112553227730619E+04, 1.5410005180819442E+04, -1.5410005180819426E+04, -1.4112553227730617E+04, -1.1857225839984601E+03}; - constexpr FLT c2[] = {1.2460481448413077E+03, 4.3127030215084988E+03, -5.5438591621431215E+03, -5.5438591621431233E+03, 4.3127030215084969E+03, 1.2460481448488895E+03}; - constexpr FLT c3[] = {6.0825549344387821E+02, -3.4106010789546866E+02, -1.9775725023673151E+03, 1.9775725023673224E+03, 3.4106010789547190E+02, -6.0825549343673049E+02}; - constexpr FLT c4[] = {1.1264961069783713E+02, -3.9740822717990801E+02, 2.7557540616463564E+02, 2.7557540616463149E+02, -3.9740822717990505E+02, 1.1264961070570472E+02}; - constexpr FLT c5[] = {-1.5387906304333869E+01, -3.2640579296386335E+01, 1.1683718215647407E+02, -1.1683718215647050E+02, 3.2640579296386335E+01, 1.5387906311562686E+01}; - constexpr FLT c6[] = {-9.3947198873910107E+00, 1.5069930500884340E+01, -8.0900452409585597E+00, -8.0900452409573536E+00, 1.5069930500885983E+01, -9.3947198802582648E+00}; - constexpr FLT c7[] = {-5.6048841964528473E-01, 2.3377422080932533E+00, -4.2391567591829169E+00, 4.2391567591861783E+00, -2.3377422080911803E+00, 5.6048842664328347E-01}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; + constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; + constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; + constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; + constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; + constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; + constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; + for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); } else if (w==7) { - constexpr FLT c0[] = {7.2950392616203362E+02, 3.6439117038309523E+04, 2.1220891582018451E+05, 3.6180058567561547E+05, 2.1220891582018466E+05, 3.6439117038309538E+04, 7.2950392617434579E+02}; - constexpr FLT c1[] = {2.2197790785452585E+03, 4.6392067080426263E+04, 1.1568051746995676E+05, -2.6471374827810822E-11, -1.1568051746995673E+05, -4.6392067080426248E+04, -2.2197790785319785E+03}; - constexpr FLT c2[] = {2.6796845075663950E+03, 2.0921129984587253E+04, 3.9399551345633640E+01, -4.7251335435527413E+04, 3.9399551345568185E+01, 2.0921129984587242E+04, 2.6796845075789138E+03}; - constexpr FLT c3[] = {1.6253748990844513E+03, 2.6138488347211651E+03, -1.0037546705421486E+04, 4.9207207296884551E-11, 1.0037546705421528E+04, -2.6138488347211514E+03, -1.6253748990726617E+03}; - constexpr FLT c4[] = {4.9106375852553407E+02, -8.6668269315415375E+02, -1.0513434716617946E+03, 2.8444456471590820E+03, -1.0513434716617835E+03, -8.6668269315414682E+02, 4.9106375853851517E+02}; - constexpr FLT c5[] = {4.0739167949763470E+01, -2.8515155742293291E+02, 3.9930326803802245E+02, 9.3897520950192402E-12, -3.9930326803800614E+02, 2.8515155742293899E+02, -4.0739167937836122E+01}; - constexpr FLT c6[] = {-1.7148987139838134E+01, 7.5799002551925454E-01, 6.3260304953181709E+01, -1.0529869309159973E+02, 6.3260304953170241E+01, 7.5799002552861849E-01, -1.7148987128070043E+01}; - constexpr FLT c7[] = {-4.5424411501048008E+00, 9.8749254058339080E+00, -9.6456179777422530E+00, 1.4220101775868667E-11, 9.6456179778363111E+00, -9.8749254058241132E+00, 4.5424411616515830E+00}; - constexpr FLT c8[] = {-5.0793946806705008E-02, 7.3273813711596381E-01, -2.0117140545159620E+00, 2.6999257940738310E+00, -2.0117140545257630E+00, 7.3273813712090197E-01, -5.0793935652734865E-02}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; + constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; + constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; + constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; + constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; + constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; + constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; + constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; + for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==8) { - constexpr FLT c0[] = {1.1895823653767156E+03, 9.0980236725237002E+04, 7.7438826909537544E+05, 2.0077596413122714E+06, 2.0077596413122721E+06, 7.7438826909537590E+05, 9.0980236725237002E+04, 1.1895823653767152E+03}; - constexpr FLT c1[] = {3.9313191526977803E+03, 1.3318570706800825E+05, 5.7275848637687659E+05, 4.6250273225257988E+05, -4.6250273225258006E+05, -5.7275848637687659E+05, -1.3318570706800825E+05, -3.9313191526977798E+03}; - constexpr FLT c2[] = {5.2976026193612415E+03, 7.5628970871188474E+04, 1.0073339198368331E+05, -1.8165150843791279E+05, -1.8165150843791300E+05, 1.0073339198368324E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; - constexpr FLT c3[] = {3.7552239608473869E+03, 1.8376340228970930E+04, -2.3878081117551392E+04, -4.6296734056047753E+04, 4.6296734056048466E+04, 2.3878081117551716E+04, -1.8376340228970901E+04, -3.7552239608473869E+03}; - constexpr FLT c4[] = {1.4742862505418659E+03, 1.2842168112180084E+02, -9.1969665138397813E+03, 7.5990739935236888E+03, 7.5990739935236415E+03, -9.1969665138397813E+03, 1.2842168112182003E+02, 1.4742862505418657E+03}; - constexpr FLT c5[] = {2.8158981009344376E+02, -8.8613607108855138E+02, 5.3457145342334591E+01, 2.1750989694613118E+03, -2.1750989694611812E+03, -5.3457145342138865E+01, 8.8613607108855138E+02, -2.8158981009344376E+02}; - constexpr FLT c6[] = {-1.4786862436220549E+00, -1.3935442261829297E+02, 3.2599325739090762E+02, -1.9541889343354751E+02, -1.9541889343356968E+02, 3.2599325739086612E+02, -1.3935442261828183E+02, -1.4786862436238759E+00}; - constexpr FLT c7[] = {-1.1542034522900533E+01, 1.2000512051415985E+01, 1.9687328710253290E+01, -6.3962883082497100E+01, 6.3962883082831397E+01, -1.9687328710065113E+01, -1.2000512051397745E+01, 1.1542034522901620E+01}; - constexpr FLT c8[] = {-1.7448292513541994E+00, 4.8577330433876664E+00, -6.8794163043749101E+00, 3.4611708986529197E+00, 3.4611708984979552E+00, -6.8794163042722616E+00, 4.8577330434089125E+00, -1.7448292513539221E+00}; - constexpr FLT c9[] = {1.5044951479000782E-01, 9.6230159355094672E-02, -7.0399250408500635E-01, 1.3251401130885254E+00, -1.3251401130188682E+00, 7.0399250409661596E-01, -9.6230159344936325E-02, -1.5044951478914617E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; + constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; + constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; + constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; + constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; + constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; + constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; + constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); } else if (w==9) { - constexpr FLT c0[] = {1.8793738965777031E+03, 2.1220891582018440E+05, 2.5233246441351655E+06, 9.2877384983420707E+06, 1.4015330434461467E+07, 9.2877384983420800E+06, 2.5233246441351655E+06, 2.1220891582018536E+05, 1.8793738965777065E+03}; - constexpr FLT c1[] = {6.6675066501609354E+03, 3.4704155240987014E+05, 2.2890184838322564E+06, 3.8705035445351237E+06, 1.1717532248112299E-10, -3.8705035445351265E+06, -2.2890184838322559E+06, -3.4704155240987102E+05, -6.6675066501609354E+03}; - constexpr FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092282E+05, -2.1140963571671949E+05, -1.4236515118873832E+06, -2.1140963571672430E+05, 6.8167589492092212E+05, 2.3171563090202416E+05, 9.8412775404612275E+03}; - constexpr FLT c3[] = {7.8762358364031061E+03, 7.6500585979636191E+04, 1.2434778984075345E+04, -2.8572091469429957E+05, 1.1900185890455270E-09, 2.8572091469430370E+05, -1.2434778984074723E+04, -7.6500585979636191E+04, -7.8762358364031033E+03}; - constexpr FLT c4[] = {3.6941911906762075E+03, 9.9232929169976032E+03, -3.3472877669901907E+04, -1.4082384858050133E+04, 6.7911966136974472E+04, -1.4082384858045889E+04, -3.3472877669901856E+04, 9.9232929169977433E+03, 3.6941911906762098E+03}; - constexpr FLT c5[] = {9.8900189723050323E+02, -1.2736589324621348E+03, -5.0407308390125609E+03, 9.8914296140178049E+03, 6.1223023135982708E-10, -9.8914296140230235E+03, 5.0407308390128219E+03, 1.2736589324621673E+03, -9.8900189723050403E+02}; - constexpr FLT c6[] = {1.1165868717716108E+02, -5.9057035448559543E+02, 5.5860705835625356E+02, 9.1996097522935008E+02, -2.0290255886368843E+03, 9.1996097522906575E+02, 5.5860705835607132E+02, -5.9057035448565603E+02, 1.1165868717715755E+02}; - constexpr FLT c7[] = {-1.3142584300867490E+01, -4.2852762793261455E+01, 1.8188640945803897E+02, -2.1362000457586478E+02, 1.1194928851903786E-10, 2.1362000457739751E+02, -1.8188640945787162E+02, 4.2852762793424958E+01, 1.3142584300868396E+01}; - constexpr FLT c8[] = {-5.8088068374876212E+00, 1.0201832931297655E+01, -3.5220973552653217E-01, -2.6632420897260161E+01, 4.2737607183076172E+01, -2.6632420895005694E+01, -3.5220973526763744E-01, 1.0201832931314263E+01, -5.8088068374874551E+00}; - constexpr FLT c9[] = {-4.0642645973149144E-01, 1.8389772328590479E+00, -3.5549484956004700E+00, 3.2273562224626624E+00, 2.3066481718890602E-10, -3.2273562263634674E+00, 3.5549484956933464E+00, -1.8389772328126097E+00, 4.0642645973247782E-01}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; + constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; + constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; + constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; + constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; + constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; + constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; + constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; + constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; + for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); } else if (w==10) { - constexpr FLT c0[] = {2.8923571298063644E+03, 4.6856831608341972E+05, 7.5304732752870098E+06, 3.7576537584215805E+07, 7.9591606307847947E+07, 7.9591606307847947E+07, 3.7576537584215775E+07, 7.5304732752870088E+06, 4.6856831608341815E+05, 2.8923571298063584E+03}; - constexpr FLT c1[] = {1.0919387804943195E+04, 8.3976685277206486E+05, 7.9494027659552386E+06, 2.1606786285174560E+07, 1.4625897641453253E+07, -1.4625897641453268E+07, -2.1606786285174556E+07, -7.9494027659552386E+06, -8.3976685277206241E+05, -1.0919387804943173E+04}; - constexpr FLT c2[] = {1.7418455635504146E+04, 6.3489952164419868E+05, 3.1358985409389907E+06, 2.2547438801903715E+06, -6.0429762783920690E+06, -6.0429762783920504E+06, 2.2547438801903636E+06, 3.1358985409389869E+06, 6.3489952164419682E+05, 1.7418455635504106E+04}; - constexpr FLT c3[] = {1.5396188098732166E+04, 2.5490607173283477E+05, 4.2818880748176732E+05, -9.5435463094349112E+05, -1.2004850139039194E+06, 1.2004850139039543E+06, 9.5435463094349764E+05, -4.2818880748176464E+05, -2.5490607173283392E+05, -1.5396188098732144E+04}; - constexpr FLT c4[] = {8.2616700456447434E+03, 5.2880641964112423E+04, -6.1165055141129313E+04, -2.1590299490710214E+05, 2.1595822052158226E+05, 2.1595822052158433E+05, -2.1590299490713206E+05, -6.1165055141130644E+04, 5.2880641964112234E+04, 8.2616700456447343E+03}; - constexpr FLT c5[] = {2.7267169079066489E+03, 2.4572549134030178E+03, -2.6065821571076271E+04, 1.3919259807562572E+04, 4.6802084705703302E+04, -4.6802084705714791E+04, -1.3919259807544826E+04, 2.6065821571078101E+04, -2.4572549134029523E+03, -2.7267169079066462E+03}; - constexpr FLT c6[] = {5.0402062537834655E+02, -1.3640153425625094E+03, -1.4063198459010243E+03, 7.0858129627832977E+03, -4.8375233777539070E+03, -4.8375233777688618E+03, 7.0858129627894568E+03, -1.4063198459013925E+03, -1.3640153425628407E+03, 5.0402062537833399E+02}; - constexpr FLT c7[] = {2.4199726682552246E+01, -2.8393731159230907E+02, 5.1652001352658374E+02, 7.4578914842690025E+01, -1.1556759026394043E+03, 1.1556759026669868E+03, -7.4578914836335017E+01, -5.1652001352477316E+02, 2.8393731159271266E+02, -2.4199726682540764E+01}; - constexpr FLT c8[] = {-1.0545675122358718E+01, -3.0306758891736707E+00, 7.2305523762002423E+01, -1.3808908570315674E+02, 7.6293213390392353E+01, 7.6293213419941608E+01, -1.3808908572000124E+02, 7.2305523762424571E+01, -3.0306758892308885E+00, -1.0545675122367939E+01}; - constexpr FLT c9[] = {-2.1836930570445361E+00, 5.4992367507340179E+00, -4.5624617242018264E+00, -6.6492709812433128E+00, 2.0339240340948546E+01, -2.0339240355994509E+01, 6.6492709998185751E+00, 4.5624617253163429E+00, -5.4992367508385041E+00, 2.1836930570532433E+00}; - constexpr FLT c10[] = {-9.1748741454156318E-02, 5.2562451749078731E-01, -1.4144257942386596E+00, 1.8629579002072614E+00, -9.0169873685258095E-01, -9.0169875903814667E-01, 1.8629579050577161E+00, -1.4144257935638165E+00, 5.2562451754351402E-01, -9.1748741461736935E-02}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; + constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; + constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; + constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; + constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; + constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; + constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; + constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; + constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; + constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; + for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==11) { - constexpr FLT c0[] = {4.3537972057094375E+03, 9.8872306817881158E+05, 2.0938056062983297E+07, 1.3701428307175839E+08, 3.8828289972017384E+08, 5.4292197128519225E+08, 3.8828289972017366E+08, 1.3701428307175839E+08, 2.0938056062983308E+07, 9.8872306817881158E+05, 4.3537972057093921E+03}; - constexpr FLT c1[] = {1.7371472778611500E+04, 1.9155790709433779E+06, 2.4914432724618737E+07, 9.7792160665338382E+07, 1.3126779387874995E+08, -1.1645321713027108E-08, -1.3126779387875001E+08, -9.7792160665338382E+07, -2.4914432724618725E+07, -1.9155790709433777E+06, -1.7371472778611380E+04}; - constexpr FLT c2[] = {2.9650558537745463E+04, 1.6014973065836846E+06, 1.1867448782239098E+07, 2.0812212822540630E+07, -1.1749875870571045E+07, -4.5121922350041404E+07, -1.1749875870570999E+07, 2.0812212822540656E+07, 1.1867448782239093E+07, 1.6014973065836844E+06, 2.9650558537745292E+04}; - constexpr FLT c3[] = {2.8505604980264405E+04, 7.4166660874053370E+05, 2.5711466441825363E+06, -1.2146931938153724E+06, -8.3931576510115806E+06, 5.8947555067017928E-08, 8.3931576510117110E+06, 1.2146931938154269E+06, -2.5711466441825293E+06, -7.4166660874053300E+05, -2.8505604980264299E+04}; - constexpr FLT c4[] = {1.7045632829988484E+04, 1.9785834209758099E+05, 8.6361403553703407E+04, -1.0584472412325807E+06, -1.3367486018954750E+05, 1.7818009619468113E+06, -1.3367486018952320E+05, -1.0584472412325810E+06, 8.6361403553705750E+04, 1.9785834209758116E+05, 1.7045632829988426E+04}; - constexpr FLT c5[] = {6.5462464716912891E+03, 2.5347576368078731E+04, -7.5810878908802741E+04, -8.0774039751698409E+04, 2.5492801112953416E+05, 3.1373949311406158E-08, -2.5492801112952997E+05, 8.0774039751677527E+04, 7.5810878908807950E+04, -2.5347576368078797E+04, -6.5462464716912691E+03}; - constexpr FLT c6[] = {1.5684149291082226E+03, -1.0302687059850266E+03, -1.3446845770824604E+04, 2.0814393480318489E+04, 1.4366994276506950E+04, -4.4581342385966971E+04, 1.4366994276487216E+04, 2.0814393480327166E+04, -1.3446845770825106E+04, -1.0302687059851414E+03, 1.5684149291082156E+03}; - constexpr FLT c7[] = {1.9398419323286674E+02, -8.7329293867233980E+02, 2.4796533428845552E+02, 3.2905701326708659E+03, -4.8989871768521243E+03, 2.5910474731743909E-08, 4.8989871768931434E+03, -3.2905701326280059E+03, -2.4796533428623073E+02, 8.7329293867272952E+02, -1.9398419323288715E+02}; - constexpr FLT c8[] = {-4.2288232505094108E+00, -9.9574929618070513E+01, 2.9563077145679659E+02, -1.9453049353627330E+02, -4.0107401575324394E+02, 7.9532514191794951E+02, -4.0107401576649818E+02, -1.9453049352309569E+02, 2.9563077145970482E+02, -9.9574929617658114E+01, -4.2288232504962613E+00}; - constexpr FLT c9[] = {-5.3741131162116726E+00, 5.5350606001924518E+00, 1.9153744596147146E+01, -6.3189447496716646E+01, 6.6921287671707859E+01, -1.3450045688823196E-08, -6.6921287609294978E+01, 6.3189447455108059E+01, -1.9153744593546609E+01, -5.5350606002853286E+00, 5.3741131162113103E+00}; - constexpr FLT c10[] = {-7.0359426507051681E-01, 2.2229112760631806E+00, -3.2054079730741187E+00, 8.3392535011476268E-02, 6.8879260445103929E+00, -1.0795498350223303E+01, 6.8879260559828390E+00, 8.3392524213879743E-02, -3.2054079670004838E+00, 2.2229112761686296E+00, -7.0359426507381639E-01}; - constexpr FLT c11[] = {5.2648094862911970E-02, 9.9912561370710071E-02, -4.3913938793989010E-01, 7.9792986880755179E-01, -6.9191820607752896E-01, -3.1086723020887482E-08, 6.9191819251103082E-01, -7.9792986253876474E-01, 4.3913938485313375E-01, -9.9912561580306161E-02, -5.2648094876606648E-02}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; + constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; + constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; + constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; + constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; + constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; + constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; + constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; + constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; + constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; + for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); } else if (w==12) { - constexpr FLT c0[] = {6.4299692685485479E+03, 2.0077596413122746E+06, 5.4904521978991687E+07, 4.5946106674819386E+08, 1.6835469840840111E+09, 3.1308386544851584E+09, 3.1308386544851594E+09, 1.6835469840840116E+09, 4.5946106674819499E+08, 5.4904521978991836E+07, 2.0077596413122742E+06, 6.4299692685634491E+03}; - constexpr FLT c1[] = {2.6965848540274084E+04, 4.1625245902732192E+06, 7.2097002594596982E+07, 3.8505085985474664E+08, 7.9479013671674263E+08, 4.7870231281824070E+08, -4.7870231281824070E+08, -7.9479013671674287E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732182E+06, -2.6965848540258085E+04}; - constexpr FLT c2[] = {4.8869694409905118E+04, 3.7863371066322499E+06, 3.9530526716552719E+07, 1.1475134266581047E+08, 4.6311261797931008E+07, -2.0442837194260687E+08, -2.0442837194260764E+08, 4.6311261797930703E+07, 1.1475134266581020E+08, 3.9530526716552772E+07, 3.7863371066322499E+06, 4.8869694409920470E+04}; - constexpr FLT c3[] = {5.0530564260114013E+04, 1.9615784087727305E+06, 1.1044597342441026E+07, 7.9812418612436997E+06, -3.4042228324588403E+07, -3.3301805987927672E+07, 3.3301805987928241E+07, 3.4042228324588865E+07, -7.9812418612435153E+06, -1.1044597342440989E+07, -1.9615784087727298E+06, -5.0530564260099913E+04}; - constexpr FLT c4[] = {3.3081876469965486E+04, 6.2011956881368393E+05, 1.3086001239863783E+06, -3.1165484297367223E+06, -5.1982996003441429E+06, 6.3530947749620415E+06, 6.3530947749622557E+06, -5.1982996003440823E+06, -3.1165484297365877E+06, 1.3086001239863841E+06, 6.2011956881368428E+05, 3.3081876469981347E+04}; - constexpr FLT c5[] = {1.4308966168506786E+04, 1.1375573205951968E+05, -1.0318195403423737E+05, -6.6892418721464148E+05, 5.9223570255464804E+05, 1.1093685152670993E+06, -1.1093685152665814E+06, -5.9223570255454781E+05, 6.6892418721485860E+05, 1.0318195403423111E+05, -1.1375573205951942E+05, -1.4308966168492359E+04}; - constexpr FLT c6[] = {4.0848961919701046E+03, 7.5033277163530902E+03, -5.2578904182708357E+04, 6.3431596330007251E+03, 1.5984798504282974E+05, -1.2521363434086266E+05, -1.2521363434064612E+05, 1.5984798504277965E+05, 6.3431596327688303E+03, -5.2578904182719976E+04, 7.5033277163531166E+03, 4.0848961919843532E+03}; - constexpr FLT c7[] = {7.1658797373677851E+02, -1.5499947984091114E+03, -4.5490740453145772E+03, 1.4520122796449663E+04, -3.7896465827621914E+03, -2.3597107892496744E+04, 2.3597107892730306E+04, 3.7896465829102508E+03, -1.4520122796250829E+04, 4.5490740453377412E+03, 1.5499947984094479E+03, -7.1658797372277252E+02}; - constexpr FLT c8[] = {5.2022749592536726E+01, -4.0624258132612465E+02, 5.2256582979411519E+02, 9.3282469962228390E+02, -2.8710622268636553E+03, 1.7594166900407929E+03, 1.7594166904608542E+03, -2.8710622266536416E+03, 9.3282469976057041E+02, 5.2256582978430436E+02, -4.0624258132566132E+02, 5.2022749606076808E+01}; - constexpr FLT c9[] = {-7.0341875498933257E+00, -2.3043166228613529E+01, 1.2279331781902621E+02, -1.6714687552668008E+02, -4.4746498567249184E+01, 3.6060905998808425E+02, -3.6060905975626497E+02, 4.4746498638578188E+01, 1.6714687551479193E+02, -1.2279331779450688E+02, 2.3043166229077912E+01, 7.0341875614883520E+00}; - constexpr FLT c10[] = {-2.1556100132578342E+00, 4.1361104015055048E+00, 1.8107701824759481E+00, -2.1223400283067541E+01, 3.5820961921268712E+01, -1.8782945757357222E+01, -1.8782945295761856E+01, 3.5820961970532480E+01, -2.1223400227730256E+01, 1.8107701446846367E+00, 4.1361104022646886E+00, -2.1556100021360516E+00}; - constexpr FLT c11[] = {-1.1440899376747989E-01, 7.0567641591059616E-01, -1.4530217944402339E+00, 1.0571984630250064E+00, 1.4389000408734942E+00, -4.2241734506571262E+00, 4.2241732732256922E+00, -1.4389001658681779E+00, -1.0571984849752754E+00, 1.4530218273656557E+00, -7.0567641625357191E-01, 1.1440900438178589E-01}; - constexpr FLT c12[] = {-1.4486009664532199E-02, 2.9387825785133236E-03, -1.0265970208873806E-01, 2.6748270027876714E-01, -3.3606433030575705E-01, 1.5850134054436241E-01, 1.5850148084990595E-01, -3.3606430399846576E-01, 2.6748282743067825E-01, -1.0265974511212309E-01, 2.9387825100049524E-03, -1.4486000362352570E-02}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; + constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; + constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; + constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; + constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; + constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; + constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; + constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; + constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; + constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; + constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); } else if (w==13) { - constexpr FLT c0[] = {9.3397060605267925E+03, 3.9447202186643188E+06, 1.3701428307175836E+08, 1.4375660883001420E+09, 6.6384519128895750E+09, 1.5848048271166540E+10, 2.1031560281976685E+10, 1.5848048271166515E+10, 6.6384519128895721E+09, 1.4375660883001390E+09, 1.3701428307175830E+08, 3.9447202186642904E+06, 9.3397060605267870E+03}; - constexpr FLT c1[] = {4.0984512931817779E+04, 8.6828943763566837E+06, 1.9558432133067667E+08, 1.3674961320373521E+09, 3.9251291128182445E+09, 4.5116631434426517E+09, -5.2784645410468957E-07, -4.5116631434426460E+09, -3.9251291128182430E+09, -1.3674961320373495E+09, -1.9558432133067659E+08, -8.6828943763566315E+06, -4.0984512931817771E+04}; - constexpr FLT c2[] = {7.8379538318778941E+04, 8.4928073133582622E+06, 1.1992091153966446E+08, 5.0561697705436689E+08, 6.1845897311594033E+08, -5.1306326495404607E+08, -1.4790096327029381E+09, -5.1306326495404249E+08, 6.1845897311593974E+08, 5.0561697705436635E+08, 1.1992091153966436E+08, 8.4928073133582175E+06, 7.8379538318778941E+04}; - constexpr FLT c3[] = {8.6417670227040027E+04, 4.8250267333349725E+06, 3.9836803808039062E+07, 7.5026052902191281E+07, -7.7565422849559024E+07, -2.5393835488011667E+08, 3.3249826368607219E-06, 2.5393835488012213E+08, 7.7565422849558040E+07, -7.5026052902191922E+07, -3.9836803808038987E+07, -4.8250267333349492E+06, -8.6417670227040042E+04}; - constexpr FLT c4[] = {6.1161604972829395E+04, 1.7331203720075563E+06, 7.0216196997559210E+06, -3.6027138646115125E+06, -3.1775875626363419E+07, 1.6544480876799976E+06, 4.9816566960117713E+07, 1.6544480876825110E+06, -3.1775875626362957E+07, -3.6027138646109658E+06, 7.0216196997559462E+06, 1.7331203720075507E+06, 6.1161604972829424E+04}; - constexpr FLT c5[] = {2.9177164557155927E+04, 3.9318079134661297E+05, 3.1307448297762702E+05, -2.7571366584958737E+06, -9.8421840747392213E+05, 6.8469173866723683E+06, 2.8271164666996988E-07, -6.8469173866687613E+06, 9.8421840747752984E+05, 2.7571366584952055E+06, -3.1307448297760193E+05, -3.9318079134661169E+05, -2.9177164557155942E+04}; - constexpr FLT c6[] = {9.5097815505886592E+03, 4.8799940773717601E+04, -1.2734023162442955E+05, -2.5472337176560360E+05, 6.3596049196317361E+05, 2.2361868201724227E+05, -1.0716559939672153E+06, 2.2361868202200226E+05, 6.3596049196156661E+05, -2.5472337176510989E+05, -1.2734023162441404E+05, 4.8799940773715760E+04, 9.5097815505886429E+03}; - constexpr FLT c7[] = {2.0601715730545525E+03, 1.9365931141588459E+02, -2.5304303117500138E+04, 2.9151392447016315E+04, 5.9055020355996137E+04, -1.1784846181768291E+05, 2.6154044742765007E-06, 1.1784846181457305E+05, -5.9055020356659290E+04, -2.9151392447180453E+04, 2.5304303117533978E+04, -1.9365931141453160E+02, -2.0601715730545707E+03}; - constexpr FLT c8[] = {2.5975061893406377E+02, -1.0025387650570891E+03, -6.8642481197673135E+02, 6.7515314203707721E+03, -7.0772939651788483E+03, -6.5444514138990871E+03, 1.6566898963252905E+04, -6.5444514157945678E+03, -7.0772939632859488E+03, 6.7515314204902643E+03, -6.8642481194565551E+02, -1.0025387650535661E+03, 2.5975061893407650E+02}; - constexpr FLT c9[] = {5.8705282128692158E+00, -1.4424362302794552E+02, 3.3390627212323119E+02, 4.8151337259952918E+01, -1.1431733956368030E+03, 1.4557114776348812E+03, -3.3159944254032091E-07, -1.4557114806782522E+03, 1.1431733967780669E+03, -4.8151337378834590E+01, -3.3390627213511937E+02, 1.4424362302320881E+02, -5.8705282128605081E+00}; - constexpr FLT c10[] = {-4.0954969508851224E+00, -1.2634947171672739E+00, 3.8134139827368251E+01, -8.4115524684139231E+01, 4.2766848660349709E+01, 1.0573434367831015E+02, -1.9636661091449494E+02, 1.0573435467021281E+02, 4.2766847947710779E+01, -8.4115525105243464E+01, 3.8134139870558698E+01, -1.2634947126121756E+00, -4.0954969508837991E+00}; - constexpr FLT c11[] = {-6.2702735485690120E-01, 1.8595467760284645E+00, -1.3027978720941771E+00, -4.9265267037365117E+00, 1.3906831814366365E+01, -1.3753763493382712E+01, 2.6871064791607931E-07, 1.3753755542502716E+01, -1.3906831747296087E+01, 4.9265273573671839E+00, 1.3027978458757612E+00, -1.8595467797630605E+00, 6.2702735484380401E-01}; - constexpr FLT c12[] = {-4.8290636698016143E-02, 1.7531876457248552E-01, -5.0041296501579524E-01, 6.3665129689096389E-01, -1.2477021972354120E-02, -1.2061605995627183E+00, 1.8595304429529254E+00, -1.2061634758265700E+00, -1.2475794298747987E-02, 6.3665098120347430E-01, -5.0041293542010268E-01, 1.7531876909405444E-01, -4.8290636687311379E-02}; - constexpr FLT c13[] = {2.2894665623763296E-02, -7.1358251863425162E-03, -1.4950753078549017E-02, 7.0611554068321924E-02, -1.2311301880976686E-01, 1.0342486048127918E-01, -6.8988570158793749E-07, -1.0342802294420825E-01, 1.2311280070887519E-01, -7.0611922113576600E-02, 1.4950741151156504E-02, 7.1358201810974436E-03, -2.2894665619603353E-02}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; + constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; + constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; + constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; + constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; + constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; + constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; + constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; + constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; + constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; + constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; + constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; + for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); } else if (w==14) { - constexpr FLT c0[] = {1.3368785683552924E+04, 7.5304732752870303E+06, 3.2765764524435025E+08, 4.2418096936485295E+09, 2.4197690538177547E+10, 7.2227640697189728E+10, 1.2261475327356721E+11, 1.2261475327356729E+11, 7.2227640697189728E+10, 2.4197690538177608E+10, 4.2418096936485305E+09, 3.2765764524435204E+08, 7.5304732752870284E+06, 1.3368785683578022E+04}; - constexpr FLT c1[] = {6.1154444023081698E+04, 1.7488686085101545E+07, 5.0279014009863281E+08, 4.4777867842655859E+09, 1.6916819861812075E+10, 2.8971884004562843E+10, 1.6054555293734529E+10, -1.6054555293734520E+10, -2.8971884004562851E+10, -1.6916819861812094E+10, -4.4777867842655849E+09, -5.0279014009863436E+08, -1.7488686085101552E+07, -6.1154444023056109E+04}; - constexpr FLT c2[] = {1.2279790808348054E+05, 1.8230319600271538E+07, 3.3815815633684015E+08, 1.9369899011251259E+09, 3.9743454154781294E+09, 7.4954544638351953E+08, -7.0173920607394953E+09, -7.0173920607394981E+09, 7.4954544638350523E+08, 3.9743454154781094E+09, 1.9369899011251252E+09, 3.3815815633684099E+08, 1.8230319600271549E+07, 1.2279790808350702E+05}; - constexpr FLT c3[] = {1.4339321200624772E+05, 1.1200899688172197E+07, 1.2799140125169736E+08, 4.0176966726270700E+08, 7.9146174555817381E+07, -1.1719748245183482E+09, -9.6919138198233318E+08, 9.6919138198235631E+08, 1.1719748245183690E+09, -7.9146174555820629E+07, -4.0176966726270568E+08, -1.2799140125169775E+08, -1.1200899688172201E+07, -1.4339321200622563E+05}; - constexpr FLT c4[] = {1.0866548538632697E+05, 4.4565213401510660E+06, 2.8354150929531515E+07, 2.2805067924010411E+07, -1.2058223609888455E+08, -1.2775415620367479E+08, 1.9261201640091833E+08, 1.9261201640092278E+08, -1.2775415620368402E+08, -1.2058223609887798E+08, 2.2805067924010262E+07, 2.8354150929531977E+07, 4.4565213401510660E+06, 1.0866548538635395E+05}; - constexpr FLT c5[] = {5.6346565047794371E+04, 1.1743908345502394E+06, 3.0601086667308519E+06, -7.2274020134796854E+06, -1.6220595157138506E+07, 2.0773587344464455E+07, 2.8183198298702076E+07, -2.8183198298697799E+07, -2.0773587344463386E+07, 1.6220595157145990E+07, 7.2274020134800859E+06, -3.0601086667311694E+06, -1.1743908345502326E+06, -5.6346565047771030E+04}; - constexpr FLT c6[] = {2.0435142564639620E+04, 1.9450977300079435E+05, -1.1234667576916210E+05, -1.5205767549239143E+06, 1.0515640561116433E+06, 3.7458351782459249E+06, -3.3794074240140119E+06, -3.3794074240169711E+06, 3.7458351782412329E+06, 1.0515640561062016E+06, -1.5205767549244103E+06, -1.1234667576906871E+05, 1.9450977300078108E+05, 2.0435142564663318E+04}; - constexpr FLT c7[] = {5.1491366053560578E+03, 1.4735748500446980E+04, -8.1689482343558659E+04, -3.5176894225535718E+04, 3.7034248411029513E+05, -1.9109669530087037E+05, -5.2637978465954703E+05, 5.2637978466513811E+05, 1.9109669530731969E+05, -3.7034248412243859E+05, 3.5176894226134398E+04, 8.1689482343736949E+04, -1.4735748500440675E+04, -5.1491366053330503E+03}; - constexpr FLT c8[] = {8.5138795113642539E+02, -1.2978618911724870E+03, -8.7500873646799319E+03, 2.1319159614070901E+04, 7.6586611596445446E+03, -6.2424139814276627E+04, 4.2620771484048986E+04, 4.2620771487400976E+04, -6.2424139811762492E+04, 7.6586611726886877E+03, 2.1319159614126653E+04, -8.7500873648028410E+03, -1.2978618911666397E+03, 8.5138795115875746E+02}; - constexpr FLT c9[] = {7.2176142041601707E+01, -4.5543406154804239E+02, 2.8301959889246939E+02, 2.1994171513294418E+03, -4.5082500681007541E+03, 4.7658016701186381E+02, 7.1044827179414842E+03, -7.1044827207946446E+03, -4.7658016510975699E+02, 4.5082500692420190E+03, -2.1994171509014677E+03, -2.8301959872009093E+02, 4.5543406154544186E+02, -7.2176142022434362E+01}; - constexpr FLT c10[] = {-3.1135380162987940E+00, -3.8554406978579038E+01, 1.4396028115898400E+02, -1.1260050343554748E+02, -3.0073664795307559E+02, 7.2079162583931463E+02, -4.1195307853504261E+02, -4.1195308389061950E+02, 7.2079161951195317E+02, -3.0073665201295637E+02, -1.1260050330597517E+02, 1.4396028109959775E+02, -3.8554406977567140E+01, -3.1135379980017595E+00}; - constexpr FLT c11[] = {-1.6022934776926798E+00, 1.8678197421256739E+00, 8.3368944138930399E+00, -3.0791579027234270E+01, 3.4749714150762280E+01, 1.2322523792409507E+01, -7.3924012166427417E+01, 7.3924001493712765E+01, -1.2322523909478123E+01, -3.4749718994457659E+01, 3.0791578402870758E+01, -8.3368943163363198E+00, -1.8678197396867300E+00, 1.6022934951962213E+00}; - constexpr FLT c12[] = {-1.9362061844377096E-01, 6.3024467546449237E-01, -9.3262282246103156E-01, -4.8908745811188170E-01, 4.0479355563504544E+00, -6.2829791472071852E+00, 3.1767781035894589E+00, 3.1767769811448687E+00, -6.2829724125407163E+00, 4.0479411685726534E+00, -4.8908752826470542E-01, -9.3262301538118120E-01, 6.3024467436836862E-01, -1.9362060312354304E-01}; - constexpr FLT c13[] = {1.8785913715361053E-02, 3.1605272623671174E-02, -1.3655798799707175E-01, 2.5016548497515428E-01, -1.6654380378010236E-01, -2.1682631004979175E-01, 6.1785823408636587E-01, -6.1786412281044067E-01, 2.1682412904087514E-01, 1.6654140467029407E-01, -2.5016543044993139E-01, 1.3655803570664179E-01, -3.1605272197692873E-02, -1.8785905270673971E-02}; - constexpr FLT c14[] = {-1.2896545121493665E-02, -3.7106960851979211E-03, 5.8859140039070395E-04, 1.3987190631712249E-02, -3.5710919113872190E-02, 4.3405397573933885E-02, -2.0030939379906375E-02, -2.0032731865340953E-02, 4.3401439168598052E-02, -3.5712796955756618E-02, 1.3987489379284932E-02, 5.8862874383716927E-04, -3.7106965853333437E-03, -1.2896537371347905E-02}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; + constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; + constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; + constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; + constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; + constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; + constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; + constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; + constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; + constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; + constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; + constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; + constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; + for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==15) { - constexpr FLT c0[] = {1.8887777774374495E+04, 1.4015330434461441E+07, 7.5498683300180113E+08, 1.1900937739619959E+10, 8.2530965279375427E+10, 3.0178246269069617E+11, 6.3775691457119177E+11, 8.1471473119305627E+11, 6.3775691457119177E+11, 3.0178246269069659E+11, 8.2530965279375626E+10, 1.1900937739619970E+10, 7.5498683300180113E+08, 1.4015330434461441E+07, 1.8887777774374499E+04}; - constexpr FLT c1[] = {8.9780907163796350E+04, 3.4167636285297170E+07, 1.2346880033823483E+09, 1.3719272724135921E+10, 6.5858241494816727E+10, 1.5266999939989542E+11, 1.5687794513790732E+11, 8.2054309331652521E-05, -1.5687794513790729E+11, -1.5266999939989551E+11, -6.5858241494816811E+10, -1.3719272724135935E+10, -1.2346880033823485E+09, -3.4167636285297155E+07, -8.9780907163796262E+04}; - constexpr FLT c2[] = {1.8850321233130724E+05, 3.7693640983013548E+07, 8.9846818051570022E+08, 6.7094088040439663E+09, 1.9743296615199219E+10, 1.8072727219391186E+10, -2.0634615374559433E+10, -4.9654335197177406E+10, -2.0634615374559402E+10, 1.8072727219391071E+10, 1.9743296615199223E+10, 6.7094088040439653E+09, 8.9846818051569998E+08, 3.7693640983013526E+07, 1.8850321233130703E+05}; - constexpr FLT c3[] = {2.3185006533495741E+05, 2.4789475362741619E+07, 3.7751696829092431E+08, 1.7167916788178215E+09, 1.9832401267745426E+09, -3.4881359830883756E+09, -7.8785602379628572E+09, 9.7140016072625200E-05, 7.8785602379629736E+09, 3.4881359830884337E+09, -1.9832401267745149E+09, -1.7167916788178086E+09, -3.7751696829092413E+08, -2.4789475362741601E+07, -2.3185006533495741E+05}; - constexpr FLT c4[] = {1.8672970114818294E+05, 1.0741068109706741E+07, 9.8017949708492860E+07, 2.0291084954252207E+08, -2.7857869294215119E+08, -9.4112677968749356E+08, 1.7886520649348873E+08, 1.4579673547892964E+09, 1.7886520649342585E+08, -9.4112677968752539E+08, -2.7857869294214994E+08, 2.0291084954251558E+08, 9.8017949708492786E+07, 1.0741068109706741E+07, 1.8672970114818294E+05}; - constexpr FLT c5[] = {1.0411891611891470E+05, 3.1771463075269503E+06, 1.4880104152842240E+07, -6.8136965447559115E+06, -8.7072998215433106E+07, 1.8024116531034056E+06, 1.9067730799617344E+08, 4.2457739417067258E-05, -1.9067730799613068E+08, -1.8024116529409259E+06, 8.7072998215441659E+07, 6.8136965447553769E+06, -1.4880104152842039E+07, -3.1771463075269512E+06, -1.0411891611891471E+05}; - constexpr FLT c6[] = {4.1300641422694804E+04, 6.3217168592498475E+05, 7.7343707634861500E+05, -5.4575962381464886E+06, -3.7387211063140454E+06, 1.8451583614096310E+07, 3.0480804947991944E+06, -2.7500445095909819E+07, 3.0480804948348333E+06, 1.8451583614054784E+07, -3.7387211062913244E+06, -5.4575962381459959E+06, 7.7343707634824759E+05, 6.3217168592497776E+05, 4.1300641422694753E+04}; - constexpr FLT c7[] = {1.1710443348523793E+04, 7.5405449195728594E+04, -1.6634736996463325E+05, -5.6069290801800112E+05, 1.1540571564075467E+06, 1.0209821661192341E+06, -2.9641921942296810E+06, 3.3808352628184138E-05, 2.9641921942798980E+06, -1.0209821662794619E+06, -1.1540571563939669E+06, 5.6069290802062431E+05, 1.6634736996474760E+05, -7.5405449195719484E+04, -1.1710443348523821E+04}; - constexpr FLT c8[] = {2.3142324239350878E+03, 2.1710560541685127E+03, -3.6929625713073510E+04, 2.6143898219454975E+04, 1.4046980089280056E+05, -2.1033190113776314E+05, -1.1132269821056565E+05, 3.7491447377567255E+05, -1.1132269820392072E+05, -2.1033190119832297E+05, 1.4046980086087715E+05, 2.6143898218932318E+04, -3.6929625712961781E+04, 2.1710560541720374E+03, 2.3142324239350669E+03}; - constexpr FLT c9[] = {2.8879718294280184E+02, -9.2801372612475961E+02, -1.9817144426574330E+03, 9.9004179204792053E+03, -5.7928269087620147E+03, -2.1083466263505023E+04, 3.3285501948595454E+04, -2.7485328636422507E-05, -3.3285501965333991E+04, 2.1083466366979632E+04, 5.7928269521300508E+03, -9.9004179216204702E+03, 1.9817144428595318E+03, 9.2801372612847467E+02, -2.8879718294283089E+02}; - constexpr FLT c10[] = {1.3121871131812668E+01, -1.5978845116799533E+02, 2.7429718922951372E+02, 4.4598059414156506E+02, -1.8917609553066516E+03, 1.5303002688244715E+03, 1.7542368497545090E+03, -3.9411530602516441E+03, 1.7542369316431223E+03, 1.5303002442924305E+03, -1.8917609584163495E+03, 4.4598059457347478E+02, 2.7429718902435877E+02, -1.5978845117002061E+02, 1.3121871131803672E+01}; - constexpr FLT c11[] = {-2.4286151057240977E+00, -6.7839829107457454E+00, 4.6999223071396322E+01, -7.4896070961958642E+01, -3.2010113081168477E+01, 2.5022928265034139E+02, -2.8786059319143976E+02, -7.6634590881515742E-06, 2.8786055354435149E+02, -2.5022938574837804E+02, 3.2010133958326769E+01, 7.4896073537458122E+01, -4.6999222973839679E+01, 6.7839829144042234E+00, 2.4286151057002718E+00}; - constexpr FLT c12[] = {-5.4810555663540994E-01, 1.1436870829533889E+00, 8.2471503038810468E-01, -8.5602133190676231E+00, 1.5631626747736027E+01, -6.4979530690388971E+00, -1.8737705444912390E+01, 3.3283700586432069E+01, -1.8737671771580779E+01, -6.4980608237023150E+00, 1.5631576518348636E+01, -8.5602150728872868E+00, 8.2471496023535673E-01, 1.1436870829534245E+00, -5.4810555666110816E-01}; - constexpr FLT c13[] = {-1.4554612894071435E-02, 1.7022157798828938E-01, -3.7563883252838998E-01, 2.0131137597017346E-01, 8.3554102633770899E-01, -2.1191293316246047E+00, 1.9960663397068628E+00, -2.3728355667610635E-05, -1.9960994910423950E+00, 2.1191258420103383E+00, -8.3552532307350946E-01, -2.0131366602953590E-01, 3.7563888705361287E-01, -1.7022157564540871E-01, 1.4554612874103701E-02}; - constexpr FLT c14[] = {-1.2348455954758902E-02, 2.6143546776172359E-03, -2.9252135300577905E-02, 7.5391681327619392E-02, -8.7984403647335341E-02, 1.3344627281489669E-03, 1.5252941418184685E-01, -2.3235937480302737E-01, 1.5257226311939021E-01, 1.3278049251030887E-03, -8.7990378598784807E-02, 7.5392790961460260E-02, -2.9252188648358976E-02, 2.6143533439228375E-03, -1.2348455958015002E-02}; - constexpr FLT c15[] = {1.4214685601398354E-02, -1.2364336624800189E-03, 1.2892619016815934E-03, 1.6178062163508013E-03, -8.2136742192079667E-03, 1.3906385413195475E-02, -1.1450713230272313E-02, -3.7721726447119798E-06, 1.1423376007684534E-02, -1.3922509066323734E-02, 8.2263143670307064E-03, -1.6156663488059737E-03, -1.2892038432598459E-03, 1.2364357359950825E-03, -1.4214685605448193E-02}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; + constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; + constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; + constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; + constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; + constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; + constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; + constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; + constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; + constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; + constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; + constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; + constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; + for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); } else if (w==16) { - constexpr FLT c0[] = {2.6374086784014766E+04, 2.5501413681212697E+07, 1.6835469840840111E+09, 3.1953580806547901E+10, 2.6584910126662793E+11, 1.1715858191494631E+12, 3.0181658330343154E+12, 4.7888775408612793E+12, 4.7888775408612793E+12, 3.0181658330343149E+12, 1.1715858191494631E+12, 2.6584910126662802E+11, 3.1953580806547905E+10, 1.6835469840840123E+09, 2.5501413681212693E+07, 2.6374086784014838E+04}; - constexpr FLT c1[] = {1.2991568388123445E+05, 6.4986154651133671E+07, 2.9142305012947264E+09, 3.9748054433728172E+10, 2.3649443248440253E+11, 7.0471088240421252E+11, 1.0533888905987035E+12, 5.4832304482297614E+11, -5.4832304482297620E+11, -1.0533888905987037E+12, -7.0471088240421265E+11, -2.3649443248440253E+11, -3.9748054433728172E+10, -2.9142305012947268E+09, -6.4986154651133649E+07, -1.2991568388123452E+05}; - constexpr FLT c2[] = {2.8421223836872837E+05, 7.5448503558118597E+07, 2.2710828032883873E+09, 2.1491603403163834E+10, 8.4299374042308197E+10, 1.3384457365769531E+11, 1.8630012765538406E+09, -2.4384536789321063E+11, -2.4384536789321036E+11, 1.8630012765533686E+09, 1.3384457365769537E+11, 8.4299374042308105E+10, 2.1491603403163818E+10, 2.2710828032883859E+09, 7.5448503558118537E+07, 2.8421223836872837E+05}; - constexpr FLT c3[] = {3.6653021243297530E+05, 5.2693428548387125E+07, 1.0410094433021290E+09, 6.3986267576853638E+09, 1.3313926739756351E+10, -2.7909761561126175E+09, -3.9911638977027939E+10, -2.9236947704012280E+10, 2.9236947704013081E+10, 3.9911638977028137E+10, 2.7909761561130028E+09, -1.3313926739756271E+10, -6.3986267576853542E+09, -1.0410094433021282E+09, -5.2693428548387118E+07, -3.6653021243297530E+05}; - constexpr FLT c4[] = {3.1185660915838124E+05, 2.4564274645530283E+07, 3.0509279143241888E+08, 1.0432225146182600E+09, 6.4966284440289930E+07, -4.2483903608015141E+09, -3.1778261722520151E+09, 5.9880587942837610E+09, 5.9880587942838221E+09, -3.1778261722524805E+09, -4.2483903608015366E+09, 6.4966284440239742E+07, 1.0432225146182716E+09, 3.0509279143241870E+08, 2.4564274645530298E+07, 3.1185660915838124E+05}; - constexpr FLT c5[] = {1.8544733523229556E+05, 7.9824949938292857E+06, 5.6880943382648587E+07, 5.4097201999261037E+07, -3.0776449202831459E+08, -3.7659931821870732E+08, 6.8797698944740057E+08, 7.5429896889854825E+08, -7.5429896889813769E+08, -6.8797698944685316E+08, 3.7659931821880990E+08, 3.0776449202837443E+08, -5.4097201999261037E+07, -5.6880943382648058E+07, -7.9824949938292904E+06, -1.8544733523229562E+05}; - constexpr FLT c6[] = {7.9472339236673346E+04, 1.8159676553648554E+06, 5.7259818806757703E+06, -1.2786136236414703E+07, -3.8677490873126298E+07, 4.7651450515746824E+07, 9.0723760109486386E+07, -9.4532949239712372E+07, -9.4532949239553988E+07, 9.0723760109301269E+07, 4.7651450515691362E+07, -3.8677490873146154E+07, -1.2786136236417659E+07, 5.7259818806749191E+06, 1.8159676553648303E+06, 7.9472339236673288E+04}; - constexpr FLT c7[] = {2.4831718998299966E+04, 2.7536301841718081E+05, -5.1045953355375612E+04, -2.6996387880195463E+06, 1.1656554632389303E+06, 9.1521923450131379E+06, -6.8198180924866442E+06, -1.2555197000819867E+07, 1.2555197001241650E+07, 6.8198180927697066E+06, -9.1521923448700085E+06, -1.1656554631878142E+06, 2.6996387880213680E+06, 5.1045953356119258E+04, -2.7536301841717307E+05, -2.4831718998299926E+04}; - constexpr FLT c8[] = {5.6060763597396308E+03, 2.2154740880106889E+04, -1.0243462874801211E+05, -1.1802198892514131E+05, 6.4061699367996352E+05, -1.1166716767206143E+05, -1.4153578101430011E+06, 1.0790712966724981E+06, 1.0790712967259965E+06, -1.4153578105201155E+06, -1.1166716749694763E+05, 6.4061699367337034E+05, -1.1802198891465126E+05, -1.0243462874806672E+05, 2.2154740880108289E+04, 5.6060763597395980E+03}; - constexpr FLT c9[] = {8.7271993222052015E+02, -7.0074676858636565E+02, -1.2528372958260919E+04, 2.3643101058174649E+04, 3.1699060176870429E+04, -1.1270133590467999E+05, 3.6872846694334214E+04, 1.5168911740364679E+05, -1.5168911743408049E+05, -3.6872846682160729E+04, 1.1270133589250650E+05, -3.1699060125133125E+04, -2.3643101053990013E+04, 1.2528372958926657E+04, 7.0074676859379576E+02, -8.7271993222046206E+02}; - constexpr FLT c10[] = {7.8842259458809167E+01, -4.2070880912368045E+02, -1.0535142084668550E+02, 3.3375056840527291E+03, -4.9426353391946941E+03, -3.6567309106352213E+03, 1.5199085303756190E+04, -9.4972223386509122E+03, -9.4972222612539845E+03, 1.5199085250589107E+04, -3.6567308608802218E+03, -4.9426353295200679E+03, 3.3375056868169195E+03, -1.0535142136497778E+02, -4.2070880912233122E+02, 7.8842259458809863E+01}; - constexpr FLT c11[] = {8.9833076822322541E-02, -4.4163371176090656E+01, 1.2880771155499514E+02, 2.8722193371824223E+00, -5.7164633743445722E+02, 9.0417612969072786E+02, 1.1220387898916500E+00, -1.4190926236781661E+03, 1.4190921497862169E+03, -1.1219395160922474E+00, -9.0417626783116691E+02, 5.7164631339646269E+02, -2.8722233955477368E+00, -1.2880771178913139E+02, 4.4163371168774162E+01, -8.9833076836661779E-02}; - constexpr FLT c12[] = {-1.0900468357478950E+00, -1.1264666525354303E-01, 1.1810668147959248E+01, -3.0289105313513339E+01, 1.5494580774353590E+01, 6.0129886123389447E+01, -1.2330199171381130E+02, 6.7114507519752891E+01, 6.7114417724195803E+01, -1.2330220722314033E+02, 6.0129944490502041E+01, 1.5494578529464169E+01, -3.0289104892597450E+01, 1.1810668147959559E+01, -1.1264666963803399E-01, -1.0900468357479236E+00}; - constexpr FLT c13[] = {-1.1763610120003680E-01, 4.2939195911805172E-01, -2.7950209959937194E-01, -1.7354549670508441E+00, 5.1182015415147619E+00, -5.0538827161604676E+00, -2.1270036462171213E+00, 1.0709458682620088E+01, -1.0709612225647817E+01, 2.1267942693611270E+00, 5.0538338615607357E+00, -5.1181806038291624E+00, 1.7354571480597607E+00, 2.7950229043765212E-01, -4.2939195443229039E-01, 1.1763610122666045E-01}; - constexpr FLT c14[] = {-1.8020499668410097E-02, 3.6694580839244442E-02, -1.1331134794057113E-01, 1.3971228975695787E-01, 8.1734604430561311E-02, -5.4464516301492671E-01, 7.9646109231150031E-01, -3.9024149191964747E-01, -3.9020325223035940E-01, 7.9644613359376126E-01, -5.4458780348100966E-01, 8.1735287282159258E-02, 1.3971280189565236E-01, -1.1331156133169454E-01, 3.6694584840328316E-02, -1.8020499652780946E-02}; - constexpr FLT c15[] = {1.4589783473923206E-02, -7.8885429103313365E-04, -4.4856766056362643E-03, 1.8116483572926646E-02, -3.0574294775135746E-02, 1.8967420978453962E-02, 2.4666137072064612E-02, -6.8017929307730221E-02, 6.7615302446897660E-02, -2.4691085605299815E-02, -1.9038882601578176E-02, 3.0552398456072709E-02, -1.8118938614760938E-02, 4.4854443719491892E-03, 7.8884755210919307E-04, -1.4589783498222219E-02}; - constexpr FLT c16[] = {-1.0467998078291846E-02, -3.2140608463710125E-04, 5.2959666930518063E-04, -1.5769844275261027E-04, -1.4331371817542763E-03, 3.7100687637655694E-03, -3.8742310984482158E-03, 1.6810223071268796E-03, 1.6547563335702548E-03, -3.9924279794162345E-03, 3.6969357769948610E-03, -1.4380620517984166E-03, -1.5934006609813836E-04, 5.2953895598459668E-04, -3.2140848935911386E-04, -1.0467998075160606E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; + constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; + constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; + constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; + constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; + constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; + constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; + constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; + constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; + constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; + constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; + constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; + constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; + constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); } else printf("width not implemented!\n"); diff --git a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc b/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc deleted file mode 100644 index e2fa229b7..000000000 --- a/include/cufinufft/contrib/ker_lowupsampfac_horner_allw_loop_constexpr.inc +++ /dev/null @@ -1,171 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {6.1209111871385702E-01, 6.1209111871385702E-01}; - constexpr FLT c1[] = {6.4742429432896431E-01, -6.4742429432896442E-01}; - constexpr FLT c2[] = {-9.0411309581634847E-02, -9.0411309581634750E-02}; - constexpr FLT c3[] = {-1.9075708590566751E-01, 1.9075708590566753E-01}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - constexpr FLT c0[] = {2.4728112933307078E-01, 1.0000000000000044E+00, 2.4728112935494964E-01}; - constexpr FLT c1[] = {4.0470611346184543E-01, 2.1212921335912390E-17, -4.0470611343822160E-01}; - constexpr FLT c2[] = {1.4864411342268655E-01, -3.0473448739822773E-01, 1.4864411344492173E-01}; - constexpr FLT c3[] = {-4.4469294619149627E-02, 1.3598904496642886E-16, 4.4469294640111616E-02}; - constexpr FLT c4[] = {-2.9270010751775037E-02, 3.7966707032750659E-02, -2.9270010728701147E-02}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==4) { - constexpr FLT c0[] = {8.4048892491849839E-02, 7.9275732207620875E-01, 7.9275732207620908E-01, 8.4048892491849811E-02}; - constexpr FLT c1[] = {1.7431588385887239E-01, 3.7425489538028417E-01, -3.7425489538028422E-01, -1.7431588385887242E-01}; - constexpr FLT c2[] = {1.1425598262146337E-01, -1.1126112046907141E-01, -1.1126112046907137E-01, 1.1425598262146335E-01}; - constexpr FLT c3[] = {1.5677587697716072E-02, -6.7022293289915616E-02, 6.7022293289915727E-02, -1.5677587697716041E-02}; - constexpr FLT c4[] = {-1.0401300825285629E-02, 6.3725646657139309E-03, 6.3725646657139005E-03, -1.0401300825285625E-02}; - constexpr FLT c5[] = {-3.0464394190490617E-03, 5.3247889205097435E-03, -5.3247889205097279E-03, 3.0464394190490305E-03}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==5) { - constexpr FLT c0[] = {2.5811126752233307E-02, 4.6616226852477344E-01, 1.0000000000000007E+00, 4.6616226852477305E-01, 2.5811126752233318E-02}; - constexpr FLT c1[] = {6.2936773057387055E-02, 3.7198919402374020E-01, 2.1212921335912559E-17, -3.7198919402374009E-01, -6.2936773057387055E-02}; - constexpr FLT c2[] = {5.4855980576944567E-02, 3.7709308632020676E-02, -1.8284069243892637E-01, 3.7709308632020731E-02, 5.4855980576944567E-02}; - constexpr FLT c3[] = {1.8780973157032140E-02, -3.8322611720715660E-02, 1.4047484462204681E-16, 3.8322611720715834E-02, -1.8780973157032116E-02}; - constexpr FLT c4[] = {-2.3306908700105430E-05, -8.3858973028989436E-03, 1.4886952481383787E-02, -8.3858973028988499E-03, -2.3306908700106227E-05}; - constexpr FLT c5[] = {-1.5212353034889806E-03, 1.7151925122365422E-03, 1.0734071182258885E-16, -1.7151925122365888E-03, 1.5212353034889806E-03}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==6) { - constexpr FLT c0[] = {7.3992041846532818E-03, 2.2998056434514028E-01, 8.5775196559356059E-01, 8.5775196559356115E-01, 2.2998056434514028E-01, 7.3992041847816166E-03}; - constexpr FLT c1[] = {2.0397684222696250E-02, 2.4277466601214742E-01, 2.6509440217151281E-01, -2.6509440217151231E-01, -2.4277466601214739E-01, -2.0397684222557694E-02}; - constexpr FLT c2[] = {2.1435449512033435E-02, 7.4190333865239946E-02, -9.5369600014193256E-02, -9.5369600014193381E-02, 7.4190333865239905E-02, 2.1435449512163876E-02}; - constexpr FLT c3[] = {1.0463664645794037E-02, -5.8671703446042224E-03, -3.4019677093840447E-02, 3.4019677093840760E-02, 5.8671703446042771E-03, -1.0463664645671082E-02}; - constexpr FLT c4[] = {1.9378826192716972E-03, -6.8365127179467735E-03, 4.7406536657957962E-03, 4.7406536657958473E-03, -6.8365127179467848E-03, 1.9378826194070377E-03}; - constexpr FLT c5[] = {-2.6471424081647417E-04, -5.6150758897069279E-04, 2.0099203466671291E-03, -2.0099203466670359E-03, 5.6150758897070829E-04, 2.6471424094083520E-04}; - constexpr FLT c6[] = {-1.6161497824910217E-04, 2.5924418389355766E-04, -1.3917099193215483E-04, -1.3917099193211840E-04, 2.5924418389357192E-04, -1.6161497812639921E-04}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==7) { - constexpr FLT c0[] = {2.0163149398992283E-03, 1.0071602557045130E-01, 5.8653557849806126E-01, 1.0000000000000002E+00, 5.8653557849806159E-01, 1.0071602557045131E-01, 2.0163149399332597E-03}; - constexpr FLT c1[] = {6.1353661835569211E-03, 1.2822551681002711E-01, 3.1973557271594344E-01, -2.1212921335912596E-17, -3.1973557271594366E-01, -1.2822551681002711E-01, -6.1353661835202118E-03}; - constexpr FLT c2[] = {7.4065234100227761E-03, 5.7825030729344404E-02, 1.0889852837592919E-04, -1.3060049459923276E-01, 1.0889852837575314E-04, 5.7825030729344355E-02, 7.4065234100573725E-03}; - constexpr FLT c3[] = {4.4924606632387705E-03, 7.2245566707421303E-03, -2.7743312484355583E-02, 1.0559644416237177E-16, 2.7743312484355832E-02, -7.2245566707420826E-03, -4.4924606632061881E-03}; - constexpr FLT c4[] = {1.3572774007773842E-03, -2.3954706749181320E-03, -2.9058644824981098E-03, 7.8619155407045772E-03, -2.9058644824980807E-03, -2.3954706749181507E-03, 1.3572774008132615E-03}; - constexpr FLT c5[] = {1.1260116639581618E-04, -7.8814564904709067E-04, 1.1036556706849172E-03, -3.0492924261508591E-17, -1.1036556706849482E-03, 7.8814564904710227E-04, -1.1260116636284763E-04}; - constexpr FLT c6[] = {-4.7399003259805808E-05, 2.0950491943152726E-06, 1.7484854214667859E-04, -2.9104069274769336E-04, 1.7484854214659272E-04, 2.0950491943114936E-06, -4.7399003227280901E-05}; - constexpr FLT c7[] = {-1.2555096177146811E-05, 2.7293834771974277E-05, -2.6660039700396876E-05, 5.1878356274645480E-17, 2.6660039700612832E-05, -2.7293834771939816E-05, 1.2555096209061404E-05}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==8) { - constexpr FLT c0[] = {5.2827275612461462E-04, 4.0402734444109238E-02, 3.4389230803369686E-01, 8.9161099745784866E-01, 8.9161099745784866E-01, 3.4389230803369708E-01, 4.0402734444109252E-02, 5.2827275612461408E-04}; - constexpr FLT c1[] = {1.7458301875074096E-03, 5.9145446836664541E-02, 2.5435204236257858E-01, 2.0538938722823222E-01, -2.0538938722823233E-01, -2.5435204236257858E-01, -5.9145446836664547E-02, -1.7458301875074094E-03}; - constexpr FLT c2[] = {2.3525728171808306E-03, 3.3585505340219701E-02, 4.4733940386002209E-02, -8.0668262921248624E-02, -8.0668262921248748E-02, 4.4733940386002119E-02, 3.3585505340219687E-02, 2.3525728171808311E-03}; - constexpr FLT c3[] = {1.6676293877589678E-03, 8.1606118103203940E-03, -1.0603838868224419E-02, -2.0559571166483725E-02, 2.0559571166484002E-02, 1.0603838868224510E-02, -8.1606118103203749E-03, -1.6676293877589678E-03}; - constexpr FLT c4[] = {6.5470478006265378E-04, 5.7029826102775656E-05, -4.0842122325118182E-03, 3.3746160664395084E-03, 3.3746160664396086E-03, -4.0842122325118321E-03, 5.7029826102778678E-05, 6.5470478006265432E-04}; - constexpr FLT c5[] = {1.2504911757628686E-04, -3.9351755557266000E-04, 2.3739384784447216E-05, 9.6592347103022203E-04, -9.6592347103013649E-04, -2.3739384784439440E-05, 3.9351755557266586E-04, -1.2504911757628702E-04}; - constexpr FLT c6[] = {-6.5665874015798238E-07, -6.1884865695206891E-05, 1.4476791315356577E-04, -8.6782118193344350E-05, -8.6782118193318939E-05, 1.4476791315358196E-04, -6.1884865695214169E-05, -6.5665874015806602E-07}; - constexpr FLT c7[] = {-5.1256159860509675E-06, 5.3292178505898186E-06, 8.7427989025457230E-06, -2.8404799465047339E-05, 2.8404799465135336E-05, -8.7427989024875505E-06, -5.3292178505782125E-06, 5.1256159860509675E-06}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3409415535124456E-04, 1.5141199617983757E-02, 1.8004032483820079E-01, 6.6268423293859657E-01, 1.0000000000000004E+00, 6.6268423293859746E-01, 1.8004032483820084E-01, 1.5141199617983828E-02, 1.3409415535124450E-04}; - constexpr FLT c1[] = {4.7572953640583401E-04, 2.4761567630011042E-02, 1.6332247709293549E-01, 2.7616213278983226E-01, -4.2425842671825223E-17, -2.7616213278983237E-01, -1.6332247709293549E-01, -2.4761567630011111E-02, -4.7572953640583401E-04}; - constexpr FLT c2[] = {7.0217948741779855E-04, 1.6533012331430421E-02, 4.8637875368588490E-02, -1.5084170630533007E-02, -1.0157816246606997E-01, -1.5084170630533338E-02, 4.8637875368588449E-02, 1.6533012331430445E-02, 7.0217948741779833E-04}; - constexpr FLT c3[] = {5.6197289626769645E-04, 5.4583505067803007E-03, 8.8722695781044485E-04, -2.0386313118366230E-02, 1.4346537772579219E-16, 2.0386313118366597E-02, -8.8722695781040203E-04, -5.4583505067802999E-03, -5.6197289626769645E-04}; - constexpr FLT c4[] = {2.6358216867957524E-04, 7.0803132065997147E-04, -2.3883045659485441E-03, -1.0047843626593360E-03, 4.8455486978739078E-03, -1.0047843626590051E-03, -2.3883045659485362E-03, 7.0803132065996898E-04, 2.6358216867957530E-04}; - constexpr FLT c5[] = {7.0565721004957831E-05, -9.0876125855045856E-05, -3.5965836571493702E-04, 7.0575785995728897E-04, 5.6006957738110937E-17, -7.0575785995746006E-04, 3.5965836571493702E-04, 9.0876125855046818E-05, -7.0565721004957980E-05}; - constexpr FLT c6[] = {7.9668965137354764E-06, -4.2137454928171943E-05, 3.9856859670063718E-05, 6.5639620808911507E-05, -1.4477186949841611E-04, 6.5639620808762402E-05, 3.9856859670072629E-05, -4.2137454928186349E-05, 7.9668965137352681E-06}; - constexpr FLT c7[] = {-9.3772917893888351E-07, -3.0575635011675480E-06, 1.2977675432514170E-05, -1.5241881422267232E-05, 5.6444540850624641E-17, 1.5241881422464882E-05, -1.2977675432482811E-05, 3.0575635011824812E-06, 9.3772917893893782E-07}; - constexpr FLT c8[] = {-4.1446092652958961E-07, 7.2790527337844100E-07, -2.5130319764268858E-08, -1.9002349621010172E-06, 3.0493470976000790E-06, -1.9002349619116138E-06, -2.5130319761051126E-08, 7.2790527337217009E-07, -4.1446092652952507E-07}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==10) { - constexpr FLT c0[] = {3.3157481538170295E-05, 5.3715860775974443E-03, 8.6328042282845782E-02, 4.3077092326437988E-01, 9.1242439930731112E-01, 9.1242439930731112E-01, 4.3077092326437971E-01, 8.6328042282845754E-02, 5.3715860775974227E-03, 3.3157481538170322E-05}; - constexpr FLT c1[] = {1.2517797191066981E-04, 9.6269418565961412E-03, 9.1130577457178452E-02, 2.4769645835465362E-01, 1.6766875916810517E-01, -1.6766875916810536E-01, -2.4769645835465354E-01, -9.1130577457178424E-02, -9.6269418565961117E-03, -1.2517797191066951E-04}; - constexpr FLT c2[] = {1.9968216068682153E-04, 7.2783782301876591E-03, 3.5949398124193940E-02, 2.5847993600195553E-02, -6.9275634160640490E-02, -6.9275634160640504E-02, 2.5847993600195445E-02, 3.5949398124193913E-02, 7.2783782301876375E-03, 1.9968216068682094E-04}; - constexpr FLT c3[] = {1.7649923565147242E-04, 2.9221990881931090E-03, 4.9086823797165058E-03, -1.0940556313145914E-02, -1.3762152424114656E-02, 1.3762152424114910E-02, 1.0940556313146081E-02, -4.9086823797164919E-03, -2.9221990881930998E-03, -1.7649923565147204E-04}; - constexpr FLT c4[] = {9.4710355505531920E-05, 6.0621452710061727E-04, -7.0118560592788729E-04, -2.4750745659639179E-03, 2.4757076628501668E-03, 2.4757076628502063E-03, -2.4750745659640264E-03, -7.0118560592788274E-04, 6.0621452710061163E-04, 9.4710355505531771E-05}; - constexpr FLT c5[] = {3.1258610702677804E-05, 2.8169545035126350E-05, -2.9881406711974808E-04, 1.5956798534243302E-04, 5.3653099874326161E-04, -5.3653099874339388E-04, -1.5956798534226972E-04, 2.9881406711975192E-04, -2.8169545035121488E-05, -3.1258610702677743E-05}; - constexpr FLT c6[] = {5.7780052154065432E-06, -1.5636835808661990E-05, -1.6121807313036067E-05, 8.1230533420465018E-05, -5.5456530742754838E-05, -5.5456530742851827E-05, 8.1230533420445272E-05, -1.6121807313045130E-05, -1.5636835808665131E-05, 5.7780052154064593E-06}; - constexpr FLT c7[] = {2.7742147829406768E-07, -3.2550081973304980E-06, 5.9212960378031332E-06, 8.5495977199682674E-07, -1.3248468528032551E-05, 1.3248468528215217E-05, -8.5495977185729702E-07, -5.9212960377964950E-06, 3.2550081973313239E-06, -2.7742147829400097E-07}; - constexpr FLT c8[] = {-1.2089379439825852E-07, -3.4743143855784781E-08, 8.2889801006379481E-07, -1.5830293785226849E-06, 8.7461219388985494E-07, 8.7461219397529632E-07, -1.5830293786451511E-06, 8.2889801008534534E-07, -3.4743143855462353E-08, -1.2089379439833804E-07}; - constexpr FLT c9[] = {-2.5033479260872450E-08, 6.3042298326687954E-08, -5.2303271559903752E-08, -7.6226091757998386E-08, 2.3316553102767969E-07, -2.3316553111902137E-07, 7.6226091879787297E-08, 5.2303271554367896E-08, -6.3042298324957995E-08, 2.5033479260965031E-08}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==11) { - constexpr FLT c0[] = {8.0191950887587638E-06, 1.8211144887695905E-03, 3.8565497751765702E-02, 2.5236459439543663E-01, 7.1517256669690443E-01, 1.0000000000000002E+00, 7.1517256669690443E-01, 2.5236459439543651E-01, 3.8565497751765723E-02, 1.8211144887695927E-03, 8.0191950887586707E-06}; - constexpr FLT c1[] = {3.1996260415636073E-05, 3.5282769389657661E-03, 4.5889527487056492E-02, 1.8012194355267480E-01, 2.4178022040260394E-01, 2.1212921335912587E-17, -2.4178022040260411E-01, -1.8012194355267488E-01, -4.5889527487056492E-02, -3.5282769389657648E-03, -3.1996260415635850E-05}; - constexpr FLT c2[] = {5.4612928019025183E-05, 2.9497743530118290E-03, 2.1858479505161201E-02, 3.8333708936616528E-02, -2.1641923687039297E-02, -8.3109405654057292E-02, -2.1641923687039287E-02, 3.8333708936616487E-02, 2.1858479505161187E-02, 2.9497743530118290E-03, 5.4612928019024885E-05}; - constexpr FLT c3[] = {5.2504054888010150E-05, 1.3660648269306127E-03, 4.7357572177382694E-03, -2.2373255422688926E-03, -1.5459233729560824E-02, -3.0584997651941540E-18, 1.5459233729561050E-02, 2.2373255422689746E-03, -4.7357572177382599E-03, -1.3660648269306129E-03, -5.2504054888009953E-05}; - constexpr FLT c4[] = {3.1396100602888584E-05, 3.6443237253636144E-04, 1.5906780001786821E-04, -1.9495384184342716E-03, -2.4621376046556434E-04, 3.2818730060399505E-03, -2.4621376046541547E-04, -1.9495384184342974E-03, 1.5906780001787157E-04, 3.6443237253636144E-04, 3.1396100602888483E-05}; - constexpr FLT c5[] = {1.2057435171015750E-05, 4.6687328398363315E-05, -1.3963494372747466E-04, -1.4877651674418741E-04, 4.6954815721697059E-04, 7.1576260535837041E-17, -4.6954815721696283E-04, 1.4877651674414852E-04, 1.3963494372747659E-04, -4.6687328398363071E-05, -1.2057435171015728E-05}; - constexpr FLT c6[] = {2.8888404081262488E-06, -1.8976367884800935E-06, -2.4767547607257735E-05, 3.8337725458133611E-05, 2.6462355617055980E-05, -8.2113719362939881E-05, 2.6462355617066876E-05, 3.8337725458138978E-05, -2.4767547607262269E-05, -1.8976367884805327E-06, 2.8888404081262340E-06}; - constexpr FLT c7[] = {3.5729663467786725E-07, -1.6085054296206689E-06, 4.5672370507959851E-07, 6.0608527683273524E-06, -9.0233724844644286E-06, -4.5070818825954386E-17, 9.0233724845159214E-06, -6.0608527682667218E-06, -4.5672370507254818E-07, 1.6085054296207723E-06, -3.5729663467788907E-07}; - constexpr FLT c8[] = {-7.7890073973236871E-09, -1.8340559948709468E-07, 5.4451797328971916E-07, -3.5830285713854766E-07, -7.3873233537913819E-07, 1.4648976903075259E-06, -7.3873233536710514E-07, -3.5830285713236262E-07, 5.4451797329704790E-07, -1.8340559948689703E-07, -7.7890073973081013E-09}; - constexpr FLT c9[] = {-9.8984999695252047E-09, 1.0194946774280524E-08, 3.5279000677512062E-08, -1.1638771469313311E-07, 1.2326133617211816E-07, -2.5669371006274292E-17, -1.2326133615551060E-07, 1.1638771463500659E-07, -3.5279000676820083E-08, -1.0194946774410270E-08, 9.8984999695130418E-09}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==12) { - constexpr FLT c0[] = {1.9028495068410023E-06, 5.9416527261081913E-04, 1.6248140264385581E-02, 1.3597036436097915E-01, 4.9821957378204840E-01, 9.2652305802242962E-01, 9.2652305802242962E-01, 4.9821957378204840E-01, 1.3597036436097937E-01, 1.6248140264385626E-02, 5.9416527261081924E-04, 1.9028495068454171E-06}; - constexpr FLT c1[] = {7.9801239249145923E-06, 1.2318344820958854E-03, 2.1335987794357199E-02, 1.1394981969310448E-01, 2.3520579283187484E-01, 1.4166451219687695E-01, -1.4166451219687687E-01, -2.3520579283187476E-01, -1.1394981969310460E-01, -2.1335987794357230E-02, -1.2318344820958847E-03, -7.9801239249098540E-06}; - constexpr FLT c2[] = {1.4462226804444730E-05, 1.1205076408888257E-03, 1.1698445222077612E-02, 3.3958877046121660E-02, 1.3705098421608795E-02, -6.0497400607811481E-02, -6.0497400607811579E-02, 1.3705098421608806E-02, 3.3958877046121591E-02, 1.1698445222077622E-02, 1.1205076408888255E-03, 1.4462226804449267E-05}; - constexpr FLT c3[] = {1.4953735432776090E-05, 5.8049865432805142E-04, 3.2684769908807722E-03, 2.3619245295514353E-03, -1.0074268581043095E-02, -9.8551520939611746E-03, 9.8551520939615059E-03, 1.0074268581043251E-02, -2.3619245295513252E-03, -3.2684769908807648E-03, -5.8049865432805098E-04, -1.4953735432771914E-05}; - constexpr FLT c4[] = {9.7900673700200676E-06, 1.8351475200221906E-04, 3.8725987583789238E-04, -9.2229408802588448E-04, -1.5383560041742387E-03, 1.8800996948122926E-03, 1.8800996948123033E-03, -1.5383560041742409E-03, -9.2229408802591614E-04, 3.8725987583789064E-04, 1.8351475200221903E-04, 9.7900673700247601E-06}; - constexpr FLT c5[] = {4.2345162286123928E-06, 3.3664241555334181E-05, -3.0535096226552352E-05, -1.9795772057290591E-04, 1.7526295499606013E-04, 3.2830037656743561E-04, -3.2830037656734232E-04, -1.7526295499599014E-04, 1.9795772057292925E-04, 3.0535096226555273E-05, -3.3664241555334181E-05, -4.2345162286081255E-06}; - constexpr FLT c6[] = {1.2088615636792351E-06, 2.2204932634073669E-06, -1.5559909809157569E-05, 1.8771595438708362E-06, 4.7304527720902187E-05, -3.7055029721502823E-05, -3.7055029721506354E-05, 4.7304527720948991E-05, 1.8771595438366184E-06, -1.5559909809165219E-05, 2.2204932634074313E-06, 1.2088615636834544E-06}; - constexpr FLT c7[] = {2.1206307767331379E-07, -4.5869687934383747E-07, -1.3462277877507893E-06, 4.2970047520348418E-06, -1.1214870287581008E-06, -6.9831974682071699E-06, 6.9831974683366982E-06, 1.1214870288087690E-06, -4.2970047519748465E-06, 1.3462277877599186E-06, 4.5869687934394192E-07, -2.1206307766917122E-07}; - constexpr FLT c8[] = {1.5395324498807062E-08, -1.2022118042093087E-07, 1.5464523856613661E-07, 2.7605497716337475E-07, -8.4964626033234966E-07, 5.2067203458077506E-07, 5.2067203461734952E-07, -8.4964626032018743E-07, 2.7605497716040193E-07, 1.5464523856098652E-07, -1.2022118042095769E-07, 1.5395324502815322E-08}; - constexpr FLT c9[] = {-2.0816585198648028E-09, -6.8192670389370156E-09, 3.6338774649049193E-08, -4.9464520974759579E-08, -1.3242031035521981E-08, 1.0671664854533778E-07, -1.0671664854533778E-07, 1.3242031024450263E-08, 4.9464520977527511E-08, -3.6338774639015446E-08, 6.8192670391856967E-09, 2.0816585232951501E-09}; - constexpr FLT c10[] = {-6.3791929313390708E-10, 1.2240176132927394E-09, 5.3586930472778203E-10, -6.2807355748408205E-09, 1.0600657362033408E-08, -5.5585207892891946E-09, -5.5585208232281016E-09, 1.0600657414513137E-08, -6.2807355547288652E-09, 5.3586929184356377E-10, 1.2240176133909372E-09, -6.3791928984134277E-10}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==13) { - constexpr FLT c0[] = {4.4408051211162946E-07, 1.8756193861873427E-04, 6.5146989208011716E-03, 6.8352802598867876E-02, 3.1564238810082484E-01, 7.5353649746793960E-01, 9.9999999999999956E-01, 7.5353649746793838E-01, 3.1564238810082484E-01, 6.8352802598867710E-02, 6.5146989208011707E-03, 1.8756193861873272E-04, 4.4408051211162761E-07}; - constexpr FLT c1[] = {1.9487148068106057E-06, 4.1285069961250701E-04, 9.2995630713278762E-03, 6.5021145064983563E-02, 1.8663042875530009E-01, 2.1451870821533808E-01, 1.8840858949353919E-32, -2.1451870821533794E-01, -1.8663042875529998E-01, -6.5021145064983438E-02, -9.2995630713278762E-03, -4.1285069961250425E-04, -1.9487148068106044E-06}; - constexpr FLT c2[] = {3.7267581324409626E-06, 4.0381251792508734E-04, 5.7019503038218408E-03, 2.4040868593456825E-02, 2.9406233528281710E-02, -2.4394921635639378E-02, -7.0323343245740924E-02, -2.4394921635639052E-02, 2.9406233528281724E-02, 2.4040868593456791E-02, 5.7019503038218382E-03, 4.0381251792508501E-04, 3.7267581324409626E-06}; - constexpr FLT c3[] = {4.1089519307370168E-06, 2.2941839162878727E-04, 1.8941440042457443E-03, 3.5673079836347822E-03, -3.6880489041048953E-03, -1.2074156718545214E-02, 7.1013810712957114E-17, 1.2074156718545436E-02, 3.6880489041048944E-03, -3.5673079836347674E-03, -1.8941440042457413E-03, -2.2941839162878624E-04, -4.1089519307370151E-06}; - constexpr FLT c4[] = {2.9080869014384424E-06, 8.2405696428180906E-05, 3.3386109283452779E-04, -1.7130036080580219E-04, -1.5108662980936900E-03, 7.8665018928679242E-05, 2.3686576883603073E-03, 7.8665018928764622E-05, -1.5108662980936485E-03, -1.7130036080580737E-04, 3.3386109283452861E-04, 8.2405696428180703E-05, 2.9080869014384429E-06}; - constexpr FLT c5[] = {1.3873038503072801E-06, 1.8694798962849948E-05, 1.4885937076477316E-05, -1.3109520271106624E-04, -4.6797213058790025E-05, 3.2555441892430825E-04, 6.5502537691746230E-17, -3.2555441892416048E-04, 4.6797213058875582E-05, 1.3109520271106819E-04, -1.4885937076477316E-05, -1.8694798962849962E-05, -1.3873038503072801E-06}; - constexpr FLT c6[] = {4.5216719173889445E-07, 2.3203195635245624E-06, -6.0547210914038460E-06, -1.2111482379340961E-05, 3.0238388566383385E-05, 1.0632529352081665E-05, -5.0954659549722746E-05, 1.0632529352250802E-05, 3.0238388566313227E-05, -1.2111482379347288E-05, -6.0547210914040671E-06, 2.3203195635247352E-06, 4.5216719173889350E-07}; - constexpr FLT c7[] = {9.7956192761412821E-08, 9.2080334896449358E-09, -1.2031586234326618E-06, 1.3860784486076025E-06, 2.8079238803293383E-06, -5.6034103145907796E-06, 1.6113788341939994E-17, 5.6034103146040687E-06, -2.8079238803054550E-06, -1.3860784485997179E-06, 1.2031586234342167E-06, -9.2080334898128650E-09, -9.7956192761411458E-08}; - constexpr FLT c8[] = {1.2350515865275843E-08, -4.7668301905167552E-08, -3.2637845350597966E-08, 3.2101904613347501E-07, -3.3650826994957826E-07, -3.1117289066304045E-07, 7.8771611535813792E-07, -3.1117289069990237E-07, -3.3650826984246136E-07, 3.2101904612282309E-07, -3.2637845349600439E-08, -4.7668301904853071E-08, 1.2350515865276535E-08}; - constexpr FLT c9[] = {2.7912946705592266E-10, -6.8584366111657433E-09, 1.5876438439662156E-08, 2.2894800381734934E-09, -5.4355139631893104E-08, 6.9215572156100812E-08, 1.6320619156148685E-17, -6.9215572241906639E-08, 5.4355139637428967E-08, -2.2894800215659153E-09, -1.5876438439575659E-08, 6.8584366109657170E-09, -2.7912946705524691E-10}; - constexpr FLT c10[] = {-1.9473100882503891E-10, -6.0076128424585684E-11, 1.8131864354130518E-09, -3.9994904462490394E-09, 2.0334605597831887E-09, 5.0274131974512103E-09, -9.3367591026663196E-09, 5.0274136044049357E-09, 2.0334605333861501E-09, -3.9994904745315308E-09, 1.8131864358844393E-09, -6.0076128154532669E-11, -1.9473100882561411E-10}; - constexpr FLT c11[] = {-2.9813639427701670E-11, 8.8416967305832406E-11, -6.1944900155883343E-11, -2.3424446318938161E-10, 6.6123632509207570E-10, -6.5395825305270265E-10, -7.6394712006965382E-17, 6.5395802534269801E-10, -6.6123633886256970E-10, 2.3424448263843040E-10, 6.1944899055662456E-11, -8.8416967554269098E-11, 2.9813639428048382E-11}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.0213002307223062E-07, 5.7528591418445639E-05, 2.5031206020280088E-03, 3.2405046511689233E-02, 1.8485678142025513E-01, 5.5177865704975304E-01, 9.3670793123951734E-01, 9.3670793123951712E-01, 5.5177865704975315E-01, 1.8485678142025547E-01, 3.2405046511689239E-02, 2.5031206020280179E-03, 5.7528591418445801E-05, 1.0213002307242253E-07}; - constexpr FLT c1[] = {4.6718564624239767E-07, 1.3360375098030156E-04, 3.8410346178215306E-03, 3.4207779106833425E-02, 1.2923501383683489E-01, 2.2132894130184291E-01, 1.2264779624530273E-01, -1.2264779624530257E-01, -2.2132894130184308E-01, -1.2923501383683503E-01, -3.4207779106833425E-02, -3.8410346178215393E-03, -1.3360375098030178E-04, -4.6718564624220264E-07}; - constexpr FLT c2[] = {9.3810713124204527E-07, 1.3926941499858519E-04, 2.5833386162539013E-03, 1.4797516242328850E-02, 3.0361769467151970E-02, 5.7261067343619262E-03, -5.3608938764866873E-02, -5.3608938764866894E-02, 5.7261067343618603E-03, 3.0361769467151870E-02, 1.4797516242328836E-02, 2.5833386162539061E-03, 1.3926941499858543E-04, 9.3810713124224814E-07}; - constexpr FLT c3[] = {1.0954436997682021E-06, 8.5568590196649221E-05, 9.7778250562911601E-04, 3.0692948752812804E-03, 6.0463237460738756E-04, -8.9532302111318181E-03, -7.4040784665309846E-03, 7.4040784665312838E-03, 8.9532302111319968E-03, -6.0463237460737487E-04, -3.0692948752812708E-03, -9.7778250562911818E-04, -8.5568590196649329E-05, -1.0954436997680333E-06}; - constexpr FLT c4[] = {8.3014334976692641E-07, 3.4045323043173900E-05, 2.1660980714121239E-04, 1.7421792587401689E-04, -9.2118064021561887E-04, -9.7597008655075522E-04, 1.4714477548413631E-03, 1.4714477548414121E-03, -9.7597008655073809E-04, -9.2118064021559762E-04, 1.7421792587402266E-04, 2.1660980714121363E-04, 3.4045323043173968E-05, 8.3014334976713224E-07}; - constexpr FLT c5[] = {4.3045614796951587E-07, 8.9716871724550274E-06, 2.3377513570381849E-05, -5.5213296993546423E-05, -1.2391624765752083E-04, 1.5869855385555775E-04, 2.1530382494154427E-04, -2.1530382494144317E-04, -1.5869855385557331E-04, 1.2391624765755973E-04, 5.5213296993542533E-05, -2.3377513570381968E-05, -8.9716871724550325E-06, -4.3045614796933747E-07}; - constexpr FLT c6[] = {1.5611302559652642E-07, 1.4859455506706785E-06, -8.5826557923722616E-07, -1.1616353402592630E-05, 8.0333594878995593E-06, 2.8616079443375728E-05, -2.5816776957707699E-05, -2.5816776957707652E-05, 2.8616079443268301E-05, 8.0333594878977314E-06, -1.1616353402591744E-05, -8.5826557923811989E-07, 1.4859455506706314E-06, 1.5611302559670737E-07}; - constexpr FLT c7[] = {3.9336515129721532E-08, 1.1257285216182540E-07, -6.2406181937560562E-07, -2.6873173855233150E-07, 2.8292088258393860E-06, -1.4598715516905790E-06, -4.0212462690723253E-06, 4.0212462691823422E-06, 1.4598715517761175E-06, -2.8292088259133913E-06, 2.6873173855647969E-07, 6.2406181937648769E-07, -1.1257285216174059E-07, -3.9336515129545720E-08}; - constexpr FLT c8[] = {6.5041263396088790E-09, -9.9149367808853263E-09, -6.6845758889620994E-08, 1.6286641992901855E-07, 5.8507874943424797E-08, -4.7688540978638226E-07, 3.2559878511421460E-07, 3.2559878519979701E-07, -4.7688540972525423E-07, 5.8507875026096430E-08, 1.6286641993325022E-07, -6.6845758889870313E-08, -9.9149367809131923E-09, 6.5041263397795280E-09}; - constexpr FLT c9[] = {5.5138523621090170E-10, -3.4792607432658830E-09, 2.1621109687111844E-09, 1.6802313210571416E-08, -3.4440501484206901E-08, 3.6408051867813727E-09, 5.4274262350067578E-08, -5.4274262322388281E-08, -3.6408052006210212E-09, 3.4440501481438969E-08, -1.6802313213339344E-08, -2.1621109679759532E-09, 3.4792607432902108E-09, -5.5138523606396516E-10}; - constexpr FLT c10[] = {-2.3785683828448576E-11, -2.9453404124114860E-10, 1.0997757897423152E-09, -8.6020468987368310E-10, -2.2974592934948612E-09, 5.5064437603692059E-09, -3.1470905819229834E-09, -3.1470905272434506E-09, 5.5064436867561607E-09, -2.2974592840673907E-09, -8.6020468484567061E-10, 1.0997757884067548E-09, -2.9453404129270796E-10, -2.3785683688822786E-11}; - constexpr FLT c11[] = {-1.2240623323339709E-11, 1.4269095096874458E-11, 6.3689195980296716E-11, -2.3523039255622989E-10, 2.6546832331592691E-10, 9.4137182189250380E-11, -5.6473803777133577E-10, 5.6473799518218520E-10, -9.4137157913436917E-11, -2.6546835890448598E-10, 2.3523039312408576E-10, -6.3689194329967738E-11, -1.4269094997055950E-11, 1.2240623457297303E-11}; - constexpr FLT c12[] = {-1.4791529085565623E-12, 4.8147158180813514E-12, -7.1247159181258048E-12, -3.7363568005007135E-12, 3.0923958877552072E-11, -4.7998366007614543E-11, 2.4268802632733111E-11, 2.4268880217882715E-11, -4.7998325173324774E-11, 3.0923998690985708E-11, -3.7363589698227313E-12, -7.1247171622956968E-12, 4.8147157313484649E-12, -1.4791527915262285E-12}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.3183302143948793E-08, 1.7202745817468655E-05, 9.2668857465754784E-04, 1.4607490553401936E-02, 1.0130044556641116E-01, 3.7041488405244677E-01, 7.8279781886019206E-01, 1.0000000000000018E+00, 7.8279781886019228E-01, 3.7041488405244727E-01, 1.0130044556641139E-01, 1.4607490553401959E-02, 9.2668857465754882E-04, 1.7202745817468652E-05, 2.3183302143948763E-08}; - constexpr FLT c1[] = {1.1019919454791572E-07, 4.1938159428224126E-05, 1.5154850601194973E-03, 1.6839357628952684E-02, 8.0835952724673255E-02, 1.8739074372244105E-01, 1.9255567517255739E-01, -9.4204294746769593E-32, -1.9255567517255723E-01, -1.8739074372244108E-01, -8.0835952724673352E-02, -1.6839357628952709E-02, -1.5154850601194973E-03, -4.1938159428224126E-05, -1.1019919454791572E-07}; - constexpr FLT c2[] = {2.3137327105312791E-07, 4.6266060425611204E-05, 1.1028009511991974E-03, 8.2352859806754802E-03, 2.4233386066663413E-02, 2.2182889945939449E-02, -2.5327411650384993E-02, -6.0946897479642256E-02, -2.5327411650385129E-02, 2.2182889945939359E-02, 2.4233386066663424E-02, 8.2352859806754854E-03, 1.1028009511991970E-03, 4.6266060425611204E-05, 2.3137327105312783E-07}; - constexpr FLT c3[] = {2.8457821671573274E-07, 3.0427184404092299E-05, 4.6337319534911844E-04, 2.1072304367244932E-03, 2.4342755210407531E-03, -4.2814200474568563E-03, -9.6703299158782657E-03, 1.8176153030403361E-16, 9.6703299158783507E-03, 4.2814200474569379E-03, -2.4342755210407076E-03, -2.1072304367244859E-03, -4.6337319534911817E-04, -3.0427184404092296E-05, -2.8457821671573279E-07}; - constexpr FLT c4[] = {2.2919642176438702E-07, 1.3183839322480003E-05, 1.2030953406839325E-04, 2.4905754342428421E-04, -3.4193403196993951E-04, -1.1551611179404738E-03, 2.1954335627567210E-04, 1.7895433812201793E-03, 2.1954335627571010E-04, -1.1551611179404326E-03, -3.4193403196995387E-04, 2.4905754342428610E-04, 1.2030953406839360E-04, 1.3183839322480008E-05, 2.2919642176438720E-07}; - constexpr FLT c5[] = {1.2779800356186583E-07, 3.8997040140349313E-06, 1.8264189394307498E-05, -8.3632912035128204E-06, -1.0687544349164653E-04, 2.2123224044726536E-06, 2.3404180714514772E-04, 6.5064979845545577E-17, -2.3404180714503106E-04, -2.2123224042782134E-06, 1.0687544349166598E-04, 8.3632912035006689E-06, -1.8264189394307559E-05, -3.8997040140349338E-06, -1.2779800356186589E-07}; - constexpr FLT c6[] = {5.0693377499403691E-08, 7.7594237801400426E-07, 9.4933483676717755E-07, -6.6987818302423087E-06, -4.5889941143373546E-06, 2.2647907184667538E-05, 3.7412856035449417E-06, -3.3754692339426772E-05, 3.7412856034892404E-06, 2.2647907184654951E-05, -4.5889941143014083E-06, -6.6987818302351157E-06, 9.4933483676684456E-07, 7.7594237801399991E-07, 5.0693377499403691E-08}; - constexpr FLT c7[] = {1.4373673262756881E-08, 9.2554419735729795E-08, -2.0417866965615742E-07, -6.8820764686271727E-07, 1.4165168644096691E-06, 1.2531774951198972E-06, -3.6383191328570317E-06, 5.9333697238861927E-17, 3.6383191329076855E-06, -1.2531774952992520E-06, -1.4165168643945163E-06, 6.8820764685908223E-07, 2.0417866965620961E-07, -9.2554419735731158E-08, -1.4373673262756913E-08}; - constexpr FLT c8[] = {2.8405432421064598E-09, 2.6648052024128211E-09, -4.5328290134778586E-08, 3.2089634828694367E-08, 1.7241593348808383E-07, -2.5816631656161770E-07, -1.3664009513726493E-07, 4.6017883216168089E-07, -1.3664009510064915E-07, -2.5816631656773852E-07, 1.7241593343152281E-07, 3.2089634835965337E-08, -4.5328290134523662E-08, 2.6648052024185691E-09, 2.8405432421065198E-09}; - constexpr FLT c9[] = {3.5447644664522991E-10, -1.1390658479562114E-09, -2.4324028601311552E-09, 1.2152005527725076E-08, -7.1102518341828894E-09, -2.5878341862165437E-08, 4.0855407178225425E-08, -6.7229636689436406E-18, -4.0855407139474409E-08, 2.5878341989490202E-08, 7.1102518840056246E-09, -1.2152005535163887E-08, 2.4324028601311552E-09, 1.1390658479600971E-09, -3.5447644664517713E-10}; - constexpr FLT c10[] = {1.6106092880607926E-11, -1.9612809866225313E-10, 3.3667881388500915E-10, 5.4740705815843633E-10, -2.3219918220819429E-09, 1.8783264389538617E-09, 2.1531915835821252E-09, -4.8374637778167195E-09, 2.1531915732119103E-09, 1.8783264455530896E-09, -2.3219918255386980E-09, 5.4740706350069505E-10, 3.3667881394392907E-10, -1.9612809866164026E-10, 1.6106092880601619E-11}; - constexpr FLT c11[] = {-2.9809392328002639E-12, -8.3268200084267327E-12, 5.7687950483526562E-11, -9.1929198156856840E-11, -3.9289938224686938E-11, 3.0713724621937891E-10, -3.5332675603861928E-10, -4.7176615708722248E-17, 3.5332675632254561E-10, -3.0713734445835836E-10, 3.9289964949381516E-11, 9.1929194004414145E-11, -5.7687950660981567E-11, 8.3268199995541140E-12, 2.9809392327699276E-12}; - constexpr FLT c12[] = {-6.7275763613050405E-13, 1.4037883809519618E-12, 1.0122748224833392E-12, -1.0507010409950668E-11, 1.9186635811522471E-11, -7.9758147674463026E-12, -2.2999207389706864E-11, 4.0853090072343795E-11, -2.2999199222849929E-11, -7.9758923525966314E-12, 1.9186574560087790E-11, -1.0507007219772089E-11, 1.0122747905815843E-12, 1.4037883779612130E-12, -6.7275763610714771E-13}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {5.2012152104084075E-09, 5.0291159580938685E-06, 3.3201112337137914E-04, 6.3015433246683345E-03, 5.2427915343763419E-02, 2.3104762006593382E-01, 5.9521037322997228E-01, 9.4441119081353919E-01, 9.4441119081353897E-01, 5.9521037322997228E-01, 2.3104762006593382E-01, 5.2427915343763426E-02, 6.3015433246683362E-03, 3.3201112337137925E-04, 5.0291159580938685E-06, 5.2012152104083968E-09}; - constexpr FLT c1[] = {2.5620581163903698E-08, 1.2815874111792785E-05, 5.7471335914300648E-04, 7.8386860177525539E-03, 4.6638901641906975E-02, 1.3897554029141568E-01, 2.0773808644544139E-01, 1.0813440420918323E-01, -1.0813440420918335E-01, -2.0773808644544151E-01, -1.3897554029141571E-01, -4.6638901641906962E-02, -7.8386860177525539E-03, -5.7471335914300648E-04, -1.2815874111792780E-05, -2.5620581163903678E-08}; - constexpr FLT c2[] = {5.6049296769722407E-08, 1.4879146623074265E-05, 4.4787865139353408E-04, 4.2383440773521713E-03, 1.6624620601556200E-02, 2.6395394769117682E-02, 3.6740117889108559E-04, -4.8088574473126838E-02, -4.8088574473126817E-02, 3.6740117889110039E-04, 2.6395394769117647E-02, 1.6624620601556183E-02, 4.2383440773521705E-03, 4.4787865139353381E-04, 1.4879146623074262E-05, 5.6049296769722367E-08}; - constexpr FLT c3[] = {7.2283166867263369E-08, 1.0391634193778174E-05, 2.0529674430143886E-04, 1.2618687081127949E-03, 2.6256301814801142E-03, -5.5040645592548403E-04, -7.8709464111364428E-03, -5.7657980103485666E-03, 5.7657980103488684E-03, 7.8709464111365764E-03, 5.5040645592556046E-04, -2.6256301814800891E-03, -1.2618687081127923E-03, -2.0529674430143870E-04, -1.0391634193778174E-05, -7.2283166867263382E-08}; - constexpr FLT c4[] = {6.1501023800531295E-08, 4.8443034242391149E-06, 6.0167136036954489E-05, 2.0573318254801955E-04, 1.2811955521425743E-05, -8.3782209201439741E-04, -6.2669687707126603E-04, 1.1809008871739588E-03, 1.1809008871740102E-03, -6.2669687707129801E-04, -8.3782209201439957E-04, 1.2811955521424802E-05, 2.0573318254801969E-04, 6.0167136036954442E-05, 4.8443034242391132E-06, 6.1501023800531308E-08}; - constexpr FLT c5[] = {3.6571939291734573E-08, 1.5742222553115388E-06, 1.1217451065775747E-05, 1.0668471374318139E-05, -6.0694020243058218E-05, -7.4268888177597524E-05, 1.3567546096387106E-04, 1.4875477215044619E-04, -1.4875477215041898E-04, -1.3567546096383994E-04, 7.4268888177628640E-05, 6.0694020243062108E-05, -1.0668471374318139E-05, -1.1217451065775808E-05, -1.5742222553115373E-06, -3.6571939291734560E-08}; - constexpr FLT c6[] = {1.5672684443241293E-08, 3.5812571134853537E-07, 1.1292168823203332E-06, -2.5215449854185100E-06, -7.6275609266365118E-06, 9.3973092319789718E-06, 1.7891569285072030E-05, -1.8642776809419116E-05, -1.8642776809435267E-05, 1.7891569285119396E-05, 9.3973092319861496E-06, -7.6275609266374249E-06, -2.5215449854180577E-06, 1.1292168823202796E-06, 3.5812571134853394E-07, 1.5672684443241266E-08}; - constexpr FLT c7[] = {4.8970459380161511E-09, 5.4304148291621772E-08, -1.0066736763205116E-08, -5.3239387743771190E-07, 2.2987809872388434E-07, 1.8048974519458305E-06, -1.3449315565530231E-06, -2.4760016203656832E-06, 2.4760016205558345E-06, 1.3449315566530894E-06, -1.8048974519264694E-06, -2.2987809871496018E-07, 5.3239387743957950E-07, 1.0066736763205477E-08, -5.4304148291620039E-08, -4.8970459380161527E-09}; - constexpr FLT c8[] = {1.1055703983904693E-09, 4.3691209554215673E-09, -2.0201061499499309E-08, -2.3275033898522544E-08, 1.2633562932172848E-07, -2.2021804055583841E-08, -2.7912172397333448E-07, 2.1280289571270167E-07, 2.1280289561471954E-07, -2.7912172398563377E-07, -2.2021804043311624E-08, 1.2633562932175524E-07, -2.3275033897953490E-08, -2.0201061499405642E-08, 4.3691209554208717E-09, 1.1055703983904937E-09}; - constexpr FLT c9[] = {1.7210848751142109E-10, -1.3819378018358974E-10, -2.4707116696395418E-09, 4.6626394240840718E-09, 6.2513494821407377E-09, -2.2225751663756647E-08, 7.2716681831167356E-09, 2.9914504875425248E-08, -2.9914504880961111E-08, -7.2716681858846656E-09, 2.2225751666524578E-08, -6.2513494807567727E-09, -4.6626394246030589E-09, 2.4707116695638564E-09, 1.3819378018734865E-10, -1.7210848751139469E-10}; - constexpr FLT c10[] = {1.5548426850891040E-11, -8.2967690037353030E-11, -2.0776280196441915E-11, 6.5818716237227360E-10, -9.7473365318544434E-10, -7.2114132190269774E-10, 2.9974008768194548E-09, -1.8729406654385533E-09, -1.8729407980520035E-09, 2.9974009543459026E-09, -7.2114130179071973E-10, -9.7473365601368880E-10, 6.5818716417921449E-10, -2.0776280166982969E-11, -8.2967690036279040E-11, 1.5548426850876794E-11}; - constexpr FLT c11[] = {1.7715918253734007E-14, -8.7094275492396390E-12, 2.5402078548167017E-11, 5.6643084712743339E-13, -1.1273398069226705E-10, 1.7831197627554656E-10, 2.2124056737037060E-13, -2.7985821416111004E-10, 2.7985826569398559E-10, -2.2122821651802181E-13, -1.7831199885666961E-10, 1.1273397622040666E-10, -5.6643203607501166E-13, -2.5402078628021660E-11, 8.7094275492396907E-12, -1.7715918256992908E-14}; - constexpr FLT c12[] = {-2.1496737418348056E-13, -2.2214973543773537E-14, 2.3291735079229971E-12, -5.9732922869516132E-12, 3.0556730493177866E-12, 1.1858129781605648E-11, -2.4316397039401376E-11, 1.3235569405286772E-11, 1.3235463236132106E-11, -2.4316413373117597E-11, 1.1858131823320733E-11, 3.0556730493176707E-12, -5.9732919041302971E-12, 2.3291735916652542E-12, -2.2214974665309464E-14, -2.1496737416109420E-13}; - constexpr FLT c13[] = {-2.3198933254093550E-14, 8.4680085604099498E-14, -5.5120431569756550E-14, -3.4224865085091971E-13, 1.0093479536840142E-12, -9.9670676529397927E-13, -4.1953479545762892E-13, 2.1120282165025634E-12, -2.1120647150379602E-12, 4.1949829692223215E-13, 9.9668454879417257E-13, -1.0093487471304360E-12, 3.4224795658530073E-13, 5.5120400575755698E-14, -8.4680084102827573E-14, 2.3198933260903755E-14}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 7fd098925..d5009de41 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -56,7 +56,7 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) // if spreading/FT careful, shouldn't need this if, but causes no speed hit return 0.0; else - return exp(T(opts.ES_beta) * sqrt(T(1.0) - T(opts.ES_c) * x * x)); + return exp((T)opts.ES_beta * (sqrt((T)1.0 - (T)opts.ES_c * x * x) - (T)1.0)); } template @@ -71,7 +71,9 @@ static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int This is the "reference implementation", used by eg common/onedim_* 2/17/17 */ { - return abs(x) < ns / 2.0 ? exp(es_beta * (sqrt(1.0 - es_c * x * x))) : 0.0; + return abs(x) < ns / T(2.0) + ? exp((T)es_beta * (sqrt((T)1.0 - (T)es_c * x * x) - (T)1.0)) + : 0.0; } template @@ -82,23 +84,17 @@ static __inline__ __device__ void eval_kernel_vec_horner(T *ker, const T x, cons This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { -#ifdef __CUDA_ARCH__ - __builtin_assume(w >= 2); - if constexpr (std::is_same_v) { - __builtin_assume(w <= 7); - } - if constexpr (std::is_same_v) { - __builtin_assume(w <= 16); - } -#endif const auto z = fma(T(2), x, T(w - 1)); // scale so local grid offset z in [-1,1] // T z = 2 * x + w - 1.0; // insert the auto-generated code which expects z, w args, writes to ker... if (upsampfac == 2.0) { // floating point equality is fine here - using FLT = T; - using CUFINUFFT_FLT = T; + using FLT = T; #include "cufinufft/contrib/ker_horner_allw_loop.inc" } + if (upsampfac == 1.25) { // floating point equality is fine here + using FLT = T; +#include "cufinufft/contrib/ker_lowupsampfac_horner_allw_loop.inc" + } } template From ae783da138028538738616332675c0da73b5bb1c Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 24 Jul 2024 17:31:22 -0400 Subject: [PATCH 21/39] picked good defaults for method --- CMakeLists.txt | 4 ++-- examples/CMakeLists.txt | 4 ---- include/cufinufft/impl.h | 29 ++++++++++++++--------------- perftest/cuda/bench.py | 10 ++++++---- src/cuda/3d/spread3d_wrapper.cu | 1 + src/cuda/common.cu | 15 ++++++--------- 6 files changed, 29 insertions(+), 34 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 93a34f2af..3c9b84f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -271,7 +271,7 @@ if (FINUFFT_USE_CUDA) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING OR FINUFFT_BUILD_TESTS) + if (FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) add_subdirectory(test/cuda) endif () @@ -280,7 +280,7 @@ if (FINUFFT_USE_CUDA) endif () # Add tests defined in their own directory -if (FINUFFT_USE_CPU AND (BUILD_TESTING OR FINUFFT_BUILD_TESTS)) +if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) add_subdirectory(test) add_subdirectory(perftest) endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index af6f067bc..8b5afa4f5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,7 +21,3 @@ if(FINUFFT_USE_OPENMP) enable_asan(${EXAMPLE}) endforeach() endif() - -if (FINUFFT_USE_CUDA) - add_subdirectory(cuda) -endif() diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 4a1c6ae31..7d63df51e 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -144,24 +144,23 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * For type 2, we always default to method 1 (GM). */ // query the device for the amount of shared memory available - int shared_mem_per_block{}; - cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, - device_id); - RETURN_IF_CUDA_ERROR - // compute the amount of shared memory required for the method - const auto shared_mem_required = - shared_memory_required(dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, - d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); - printf("Shared memory available: %d KB, required: %d KB\n", shared_mem_per_block, - shared_mem_required); - if ((shared_mem_required > shared_mem_per_block)) { + if (dim == 3 && std::is_same_v) { d_plan->opts.gpu_method = 1; - printf("choosing method 1\n"); } else { - d_plan->opts.gpu_method = 2; - printf("choosing method 2\n"); + int shared_mem_per_block{}; + cudaDeviceGetAttribute(&shared_mem_per_block, + cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); + RETURN_IF_CUDA_ERROR + // compute the amount of shared memory required for the method + const auto shared_mem_required = shared_memory_required( + dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); + if ((shared_mem_required > shared_mem_per_block)) { + d_plan->opts.gpu_method = 1; + } else { + d_plan->opts.gpu_method = 2; + } } - printf("using method %d\n", d_plan->opts.gpu_method); } int fftsign = (iflag >= 0) ? 1 : -1; diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 8a9e757a3..aa21acd52 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,7 +37,7 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "f", +args = {"--prec": "d", "--n_runs": "5", "--method": "0", "--sort": "1", @@ -71,8 +71,10 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -for i in range(1, 7): - args["--tol"] = "1E-" + str(i) +max_range = 8 if args["--prec"] == "d" else 7 + +for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) print("Running with tol = 1E-" + str(i)) for method in ['2', '1']: args["--method"] = method @@ -180,4 +182,4 @@ def build_args(args): plt.savefig("bench.png") plt.savefig("bench.svg") plt.savefig("bench.pdf") -plt.show() \ No newline at end of file +plt.show() diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index bf78ed905..4fb2b073d 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -280,6 +280,7 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; + blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; ghost_bin_pts_index<<>>( diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 64c5639dc..ea54a4c77 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -256,11 +256,15 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } + // use half of the available shared memory if double precision + if constexpr (std::is_same_v) { + shared_mem_per_block /= 2; + } const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - // find the power of 2 that is less than bin_size - // this makes the bin_size use the maximum shared memory available + opts->gpu_binsizex = bin_size; + opts->gpu_binsizex = 1024; const auto shared_mem_required = shared_memory_required( dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", @@ -310,13 +314,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { opts->gpu_binsizex = 16; opts->gpu_binsizey = 16; opts->gpu_binsizez = 2; - // const auto shared_mem_required = shared_memory_required( - // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez); - // printf( - // "binsizex: %d, binsizey: %d, binsizez: %d shared_mem_required %ld - // (bytes)\n", opts->gpu_binsizex, opts->gpu_binsizey, - // opts->gpu_binsizez, shared_mem_required); } } break; case 4: { From d29fcf517d930ba9a99a24b7d245f01461d6b1d6 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 24 Jul 2024 19:12:35 -0400 Subject: [PATCH 22/39] update configuration --- perftest/cuda/bench.py | 4 ++-- src/cuda/common.cu | 8 +------- 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index aa21acd52..118c04d3b 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -44,7 +44,7 @@ def build_args(args): # "--N1": "16777216", "--N1": "256", "--N2": "256", - "--N3": "256", + # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", "--tol": "1E-6"} @@ -71,7 +71,7 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -max_range = 8 if args["--prec"] == "d" else 7 +max_range = 16 if args["--prec"] == "d" else 7 for i in range(1, max_range): args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index ea54a4c77..8499aea8a 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -263,13 +263,7 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - opts->gpu_binsizex = bin_size; - opts->gpu_binsizex = 1024; - const auto shared_mem_required = shared_memory_required( - dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); - // printf("binsizex: %d, shared_mem_required %ld (bytes)\n", - // opts->gpu_binsizex, - // shared_mem_required); + opts->gpu_binsizex = bin_size; } opts->gpu_binsizey = 1; opts->gpu_binsizez = 1; From 73f937b0afcf9a66313bf2607d3086504d5061e7 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 10:22:59 -0400 Subject: [PATCH 23/39] upated build system --- CMakeLists.txt | 17 ++-- perftest/cuda/CMakeLists.txt | 6 ++ perftest/cuda/bench.py | 185 +++++++++++++++++++++++++++++++++++ perftest/cuda/bench.sh | 13 +++ src/cuda/CMakeLists.txt | 39 ++++++-- 5 files changed, 244 insertions(+), 16 deletions(-) create mode 100644 perftest/cuda/bench.py create mode 100644 perftest/cuda/bench.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ca851dfe..3c9b84f3b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.19) +cmake_minimum_required(VERSION 3.23) project(FINUFFT VERSION 2.2.0 LANGUAGES C CXX) @@ -46,7 +46,7 @@ if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS AND NOT DEFINED FINUFFT_ARC endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") - +set(FINUFFT_CUDA_ARCHITECTURES "native" CACHE STRING "CUDA architectures to build for (e.g. 60;70;75;)") # All options go here # sphinx tag (don't remove): @cmake_opts_start option(FINUFFT_BUILD_EXAMPLES "Whether to build the FINUFFT examples" OFF) @@ -271,25 +271,26 @@ if (FINUFFT_USE_CUDA) enable_language(CUDA) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) - if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) + if (FINUFFT_BUILD_TESTS) add_subdirectory(perftest/cuda) + add_subdirectory(test/cuda) endif () list(APPEND INSTALL_TARGETS cufinufft) endif () # Add tests defined in their own directory -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CPU) +if (FINUFFT_USE_CPU AND FINUFFT_BUILD_TESTS) add_subdirectory(test) add_subdirectory(perftest) endif () -if (BUILD_TESTING AND FINUFFT_BUILD_TESTS AND FINUFFT_USE_CUDA) - add_subdirectory(test/cuda) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_CPU) + add_subdirectory(examples) endif () -if (FINUFFT_BUILD_EXAMPLES) - add_subdirectory(examples) +if (FINUFFT_BUILD_EXAMPLES AND FINUFFT_USE_GPU) + add_subdirectory(examples/cuda) endif () if (FINUFFT_BUILD_FORTRAN) diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 9d817d5f6..8f8a8a20b 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,3 +1,9 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) +set_target_properties(cuperftest PROPERTIES + LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} +) + +#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py new file mode 100644 index 000000000..1d52ff884 --- /dev/null +++ b/perftest/cuda/bench.py @@ -0,0 +1,185 @@ +import matplotlib.pyplot as plt +import os +import subprocess +import pandas as pd +import numpy as np +import io +cwd = os.getcwd() + + +# function that runs a command line command and returns the output +# it also takes a list of arguments to pass to the command +def run_command(command, args): + # convert command and args to a string + try: + cmd = [command] + args + print("Running command:", ' '.join(cmd)) + result = subprocess.run(cmd, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + return result.stdout, result.stderr + except subprocess.CalledProcessError as e: + print('stdout output:\n', e.stdout) + print('stderr output:\n', e.stderr) + print("Error executing command:", e) + + +# function that builds a string from a dictionary of arguments + +def build_args(args): + args_list = [] + for key, value in args.items(): + args_list.append(key) + args_list.append(value) + return args_list + + +# function + +# example command to run: +# nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 +# example arguments +args = {"--prec": "f", + "--n_runs": "5", + "--method": "0", + "--sort": "1", + "--N1": "16777216", + # "--N1": "256", + # "--N2": "256", + # "--N3": "256", + "--kerevalmethod": "1", + "--M": "1E8", + "--tol": "1E-6"} +# iterate over tol from 1E-6 to 1E-1 +data = { + 'method': [], + 'throughput': [], + 'tolerance': [], + # 'setpts': [], + 'exec': [], +} +warmup = {"--prec": "f", + "--n_runs": "1", + "--method": "0", + "--N1": "256", + "--N2": "256", + # "--N3": "256", + "--M": "256", + "--tol": "1E-1"} +cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(warmup) +print("Warmup") +stdout, stderr = run_command("nsys", cmd) +print("Benchmarking") +if stderr != '': + print(stderr) + exit(0) +max_range = 16 if args["--prec"] == "d" else 7 + +for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + args["--method"] = method + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) + # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] + + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') + # drop all the rows with spread not in "Name" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # exit(0) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) + # pt/s + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) + + +df = pd.DataFrame(data) +# Pivot the DataFrame +pivot_df = df.pivot(index='tolerance', columns='method') +# print(pivot_df) +# scale the throughput SM by GM +# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] +# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] +# scale setpts SM by GM +# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] +# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] +# remove the GM column +# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) +pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) +pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) +print(pivot_df) +# Plot +pivot_df.plot(kind='bar', figsize=(10, 7)) +# Find the minimum throughput value +min_val = min(pivot_df[('throughput', 'SM')].min(), pivot_df[('throughput', 'GM')].min()) +max_val = max(pivot_df[('throughput', 'SM')].max(), pivot_df[('throughput', 'GM')].max()) +print(min_val, max_val) +plt.ylim(min_val * .90, max_val * 1.1) +# plt.ylim(.8, 1.2) + +# Calculate the smallest power of 10 +# min_pow_10 = 10 ** np.floor(np.log10(min_throughput)) + +# Adjust the plot's y-axis limits +# plt.ylim(df['throughput'].min()*.99, df['throughput'].max() * 1.009) # Adding 10% for upper margin + +# plot an horizontal line at 1 with label "GM" +# plt.axhline(y=1, color='k', linestyle='--', label='GM') +plt.xlabel('Tolerance') +plt.ylabel('Throughput') +plt.title('Throughput by Tolerance and Method') +plt.legend(title='Method') +plt.tight_layout() +plt.show() +plt.xlabel("Tolerance") +plt.ylabel("Points/s") +plt.savefig("bench.png") +plt.savefig("bench.svg") +plt.savefig("bench.pdf") +plt.show() diff --git a/perftest/cuda/bench.sh b/perftest/cuda/bench.sh new file mode 100644 index 000000000..9832e1088 --- /dev/null +++ b/perftest/cuda/bench.sh @@ -0,0 +1,13 @@ +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e1 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e1 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e1 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +./cuperftest --prec d --n_runs 5 --N1 1e2 --N2 1e2 --N3 1e3 --M 2e6 --method 0 --tol 1e-4 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e4 --N2 1e4 --N3 1e4 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e5 --N2 1e5 --N3 1e5 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e6 --N2 1e6 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e7 --N2 1e7 --M 2e6 --method 0 --tol 1e-10 +#./cuperftest --prec d --n_runs 5 --N1 1e8 --N2 1e8 --M 2e6 --method 0 --tol 1e-10 diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index c9f13344d..751ccfc6c 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,8 +1,3 @@ - -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) set(PRECISION_DEPENDENT_SRC @@ -22,13 +17,34 @@ set(CUFINUFFT_INCLUDE_DIRS ) set(CUFINUFFT_INCLUDE_DIRS ${CUFINUFFT_INCLUDE_DIRS} PARENT_SCOPE) +# flush denormals to zero and enable verbose PTXAS output +set(FINUFFT_CUDA_FLAGS + -ftz=true -fmad=true -restrict -Xptxas=-v --extra-device-vectorization -res-usage + -Wdouble-promotion -lineinfo --extended-lambda --expt-relaxed-constexpr +) + add_library(cufinufft_common_objects OBJECT ${PRECISION_INDEPENDENT_SRC}) target_include_directories(cufinufft_common_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_common_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_common_objects PROPERTIES + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) + +target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) +target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) add_library(cufinufft_objects OBJECT ${PRECISION_DEPENDENT_SRC}) target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) -set_property(TARGET cufinufft_objects PROPERTY POSITION_INDEPENDENT_CODE ON) +set_target_properties( + cufinufft_objects PROPERTIES + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) +target_compile_features(cufinufft_objects PRIVATE cxx_std_17) +target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) if (FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED @@ -56,5 +72,12 @@ else () target_link_libraries(cufinufft PUBLIC CUDA::cudart_static CUDA::cufft_static CUDA::nvToolsExt) endif () +target_compile_options(cufinufft PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) file(GLOB CUFINUFFT_PUBLIC_HEADERS "${CMAKE_SOURCE_DIR}/include/cufinufft*.h") -set_target_properties(cufinufft PROPERTIES PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}") +set_target_properties( + cufinufft PROPERTIES + PUBLIC_HEADER "${CUFINUFFT_PUBLIC_HEADERS}" + POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON +) From 07248668e3884d7beae4a60034323aaf1087d6bd Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 15:16:55 -0400 Subject: [PATCH 24/39] fixing jenkins --- include/cufinufft/utils.h | 23 +++++ perftest/cuda/bench.py | 10 +- src/cuda/1d/spreadinterp1d.cuh | 58 ++++++------ src/cuda/2d/spreadinterp2d.cuh | 42 ++++++--- src/cuda/3d/spreadinterp3d.cuh | 161 +++++++++++++++++++-------------- 5 files changed, 185 insertions(+), 109 deletions(-) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index b0a77aec7..29645f9f9 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -81,6 +81,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { return int2{xstart, xend}; } #endif + +// Define a macro to check if NVCC version is >= 11.3 +#if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) +#if (__CUDACC_VER_MAJOR__ > 11) || \ + (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) +#define ALLOCA_SUPPORTED 1 +#else +#define ALLOCA_SUPPORTED 0 +#endif +#else +#define ALLOCA_SUPPORTED 0 +#endif + +#if defined(__CUDA_ARCH__) +#if __CUDA_ARCH__ >= 900 +#define COMPUTE_CAPABILITY_90_OR_HIGHER 1 +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif +#else +#define COMPUTE_CAPABILITY_90_OR_HIGHER 0 +#endif + } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 118c04d3b..1d52ff884 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -37,13 +37,13 @@ def build_args(args): # example command to run: # nsys profile -o cuperftest_profile ./cuperftest --prec f --n_runs 10 --method 1 --N1 256 --N2 256 --N3 256 --M 1E8 --tol 1E-6 # example arguments -args = {"--prec": "d", +args = {"--prec": "f", "--n_runs": "5", "--method": "0", "--sort": "1", - # "--N1": "16777216", - "--N1": "256", - "--N2": "256", + "--N1": "16777216", + # "--N1": "256", + # "--N2": "256", # "--N3": "256", "--kerevalmethod": "1", "--M": "1E8", @@ -60,7 +60,7 @@ def build_args(args): "--n_runs": "1", "--method": "0", "--N1": "256", - # "--N2": "256", + "--N2": "256", # "--N3": "256", "--M": "256", "--tol": "1E-1"} diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index b6c511555..c7d84a9b8 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -23,7 +23,12 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation to reduce stack usage - auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { @@ -37,8 +42,8 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (auto xx = xstart; xx <= xend; xx++) { - auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue = ker1[xx - xstart]; + auto ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const T kervalue = ker1[xx - xstart]; atomicAdd(&fw[ix].x, cnow.x * kervalue); atomicAdd(&fw[ix].y, cnow.y * kervalue); } @@ -84,11 +89,6 @@ __global__ void calc_inverse_of_global_sort_idx_1d( } } -template -__forceinline__ __device__ cuda_complex mul(const cuda_complex &a, const T b) { - return {a.x * b, a.y * b}; -} - template __global__ void spread_1d_subprob( const T *x, const cuda_complex *c, cuda_complex *fw, int M, uint8_t ns, int nf1, @@ -96,9 +96,8 @@ __global__ void spread_1d_subprob( int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, int nbinx, int *idxnupts) { extern __shared__ char sharedbuf[]; - alignas(256) auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; + auto *__restrict__ fwshared = (cuda_complex *)sharedbuf; - int ix; const int subpidx = blockIdx.x; const int bidx = subprob_to_bin[subpidx]; const int binsubp_idx = subpidx - subprobstartpts[bidx]; @@ -109,7 +108,12 @@ __global__ void spread_1d_subprob( const int N = bin_size_x + 2 * ns_2; // dynamic stack allocation - auto ker1 = (T __restrict__ *)alloca(sizeof(T) * ns); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else + T ker1[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -130,9 +134,10 @@ __global__ void spread_1d_subprob( else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ns_2; + const auto ix = xx + ns_2; if (ix >= (bin_size_x + ns_2) || ix < 0) break; - const auto result = mul(cnow, ker1[xx - xstart]); + const cuda_complex result{cnow.x * ker1[xx - xstart], + cnow.y * ker1[xx - xstart]}; atomicAdd(&fwshared[ix].x, result.x); atomicAdd(&fwshared[ix].y, result.y); } @@ -140,7 +145,7 @@ __global__ void spread_1d_subprob( __syncthreads(); /* write to global memory */ for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ns_2 + k; + auto ix = xoffset - ns_2 + k; if (ix < (nf1 + ns_2)) { ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); atomicAdd(&fw[ix].x, fwshared[k].x); @@ -155,31 +160,32 @@ template __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *idxnupts) { + // dynamic stack allocation +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; +#else T ker1[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const T x_rescaled = fold_rescale(x[idxnupts[i]], nf1); + const auto [xstart, xend] = interval(ns, x_rescaled); - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; + cuda_complex cnow{0, 0}; - T x1 = (T)xstart - x_rescaled; + const T x1 = (T)xstart - x_rescaled; if constexpr (KEREVALMETH == 1) eval_kernel_vec_horner(ker1, x1, ns, sigma); else eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - T kervalue1 = ker1[xx - xstart]; + int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); + const T kervalue1 = ker1[xx - xstart]; cnow.x += fw[ix].x * kervalue1; cnow.y += fw[ix].y * kervalue1; } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 62a430ca5..e8a69f303 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -19,9 +19,14 @@ template __global__ void spread_2d_nupts_driven( const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -130,9 +135,14 @@ __global__ void spread_2d_subprob( const auto rounded_ns = ns_2 * 2; const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { fwshared[i] = {0, 0}; @@ -202,9 +212,14 @@ template __global__ void interp_2d_nupts_driven( const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { @@ -236,8 +251,7 @@ __global__ void interp_2d_nupts_driven( cnow.y += fw[inidx].y * kervalue1 * kervalue2; } } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } @@ -252,9 +266,14 @@ __global__ void interp_2d_subprob( extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; +#endif const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -276,12 +295,11 @@ __global__ void interp_2d_subprob( auto ix = xoffset - ns_2 + i; auto iy = yoffset - ns_2 + j; if (ix < (nf1 + ns_2) && iy < (nf2 + ns_2)) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - const auto outidx = ix + int(iy * nf1); - const auto sharedidx = i + j * (bin_size_x + rounded_ns); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; + ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); + iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); + const auto outidx = ix + iy * nf1; + const auto sharedidx = i + j * (bin_size_x + rounded_ns); + fwshared[sharedidx] = fw[outidx]; } } __syncthreads(); diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index dc722ddc3..19eae72a4 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -4,6 +4,7 @@ #include #include +#include #include #include #include @@ -81,11 +82,16 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *idxnupts) { +#if ALLOCA_SUPPORTED auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; - +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -160,10 +166,16 @@ __global__ void spread_3d_subprob( fwshared[i] = {0, 0}; } __syncthreads(); +#if ALLOCA_SUPPORTED auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = threadIdx.x; i < nupts; i += blockDim.x) { const int nuptsidx = idxnupts[ptstart + i]; @@ -309,85 +321,93 @@ __global__ void spread_3d_block_gather( int nobinx, int nobiny, int nobinz, const int *idxnupts) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; + const int subpidx = blockIdx.x; + const int obidx = subprob_to_bin[subpidx]; + const int bidx = obidx * binsperobin; - int xstart, ystart, zstart, xend, yend, zend; - int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; - int subpidx = blockIdx.x; - int obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - - obinsubp_idx * maxsubprobsize); + const int obinsubp_idx = subpidx - subprobstartpts[obidx]; + const int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; + const int nupts = + min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - + obinsubp_idx * maxsubprobsize); - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; + const int xoffset = (obidx % nobinx) * obin_size_x; + const int yoffset = (obidx / nobinx) % nobiny * obin_size_y; + const int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - int N = obin_size_x * obin_size_y * obin_size_z; + const int N = obin_size_x * obin_size_y * obin_size_z; +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto *__restrict__ ker1 = ker; + auto *__restrict__ ker2 = ker + ns; + auto *__restrict__ ker3 = ker + ns + ns; +#else T ker1[MAX_NSPREAD]; T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; - +#endif for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; + fwshared[i] = {0, 0}; } + __syncthreads(); - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; for (int i = threadIdx.x; i < nupts; i += blockDim.x) { int nidx = idxnupts[ptstart + i]; int b = nidx / M; int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) box[d] = -1; - if (box[d] == 2) box[d] = 1; + for (int &d : box) { + d = b % 3; + if (d == 1) d = -1; + if (d == 2) d = 1; b = b / 3; } - int ii = nidx % M; - x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; - y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; - z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; + const int ii = nidx % M; + const auto x_rescaled = fold_rescale(x[ii], nf1) + box[0] * nf1; + const auto y_rescaled = fold_rescale(y[ii], nf2) + box[1] * nf2; + const auto z_rescaled = fold_rescale(z[ii], nf3) + box[2] * nf3; + const auto cnow = c[ii]; + auto [xstart, xend] = interval(ns, x_rescaled); + auto [ystart, yend] = interval(ns, y_rescaled); + auto [zstart, zend] = interval(ns, z_rescaled); + + const T x1 = T(xstart) - x_rescaled; + const T y1 = T(ystart) - y_rescaled; + const T z1 = T(zstart) - z_rescaled; + + xstart -= xoffset; + ystart -= yoffset; + zstart -= zoffset; + + xend -= xoffset; + yend -= yoffset; + zend -= zoffset; if constexpr (KEREVALMETH == 1) { - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); } else { - eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); - eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - xstartnew = xstart < 0 ? 0 : xstart; - ystartnew = ystart < 0 ? 0 : ystart; - zstartnew = zstart < 0 ? 0 : zstart; - xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; - yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; - zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; + const auto xstartnew = xstart < 0 ? 0 : xstart; + const auto ystartnew = ystart < 0 ? 0 : ystart; + const auto zstartnew = zstart < 0 ? 0 : zstart; + const auto xendnew = xend >= obin_size_x ? obin_size_x - 1 : xend; + const auto yendnew = yend >= obin_size_y ? obin_size_y - 1 : yend; + const auto zendnew = zend >= obin_size_z ? obin_size_z - 1 : zend; for (int zz = zstartnew; zz <= zendnew; zz++) { - T kervalue3 = ker3[zz - zstart]; + const T kervalue3 = ker3[zz - zstart]; for (int yy = ystartnew; yy <= yendnew; yy++) { - T kervalue2 = ker2[yy - ystart]; + const T kervalue2 = ker2[yy - ystart]; for (int xx = xstartnew; xx <= xendnew; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T kervalue1 = ker1[xx - xstart]; + const auto outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; + const T kervalue1 = ker1[xx - xstart]; atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); } @@ -401,10 +421,10 @@ __global__ void spread_3d_block_gather( int j = (n / obin_size_x) % obin_size_y; int k = n / (obin_size_x * obin_size_y); - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; + const auto ix = xoffset + i; + const auto iy = yoffset + j; + const auto iz = zoffset + k; + const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; atomicAdd(&fw[outidx].x, fwshared[n].x); atomicAdd(&fw[outidx].y, fwshared[n].y); } @@ -416,10 +436,16 @@ template __global__ void interp_3d_nupts_driven( const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, int *idxnupts) { - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { const auto x_rescaled = fold_rescale(x[idxnupts[i]], nf1); @@ -461,8 +487,7 @@ __global__ void interp_3d_nupts_driven( } } } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; + c[idxnupts[i]] = cnow; } } @@ -478,10 +503,16 @@ __global__ void interp_3d_subprob( extern __shared__ char sharedbuf[]; auto fwshared = (cuda_complex *)sharedbuf; - auto ker = (T *)alloca(sizeof(T) * ns * 2); +#if ALLOCA_SUPPORTED + auto ker = (T *)alloca(sizeof(T) * ns * 3); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; auto *__restrict__ ker3 = ker + ns + ns; +#else + T ker1[MAX_NSPREAD]; + T ker2[MAX_NSPREAD]; + T ker3[MAX_NSPREAD]; +#endif const auto subpidx = blockIdx.x; const auto bidx = subprob_to_bin[subpidx]; @@ -514,8 +545,7 @@ __global__ void interp_3d_subprob( const auto outidx = ix + iy * nf1 + iz * nf1 * nf2; int sharedidx = i + j * (bin_size_x + rounded_ns) + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; + fwshared[sharedidx] = fw[outidx]; } } __syncthreads(); @@ -569,8 +599,7 @@ __global__ void interp_3d_subprob( } } } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; + c[idxnupts[idx]] = cnow; } } From 8cd50fc3eb4946179f865b60997922ff38207152 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 15:18:37 -0400 Subject: [PATCH 25/39] using cuda 11.2 --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f042f7749..e5e76cf06 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -9,7 +9,7 @@ pipeline { stage('main') { agent { dockerfile { - filename 'tools/cufinufft/docker/cuda12.0/Dockerfile-x86_64' + filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64' args '--gpus 2' label 'v100' } From 49a9d7eed481953cfa54cbae34b0ab5fb2052237 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 17:15:25 -0400 Subject: [PATCH 26/39] using sm90 atomics --- include/cufinufft/spreadinterp.h | 8 +- include/cufinufft/utils.h | 22 ++++ perftest/cuda/CMakeLists.txt | 5 +- perftest/cuda/bench.py | 172 +++++++++++++++++-------------- src/cuda/1d/spreadinterp1d.cuh | 6 +- src/cuda/2d/spreadinterp2d.cuh | 14 ++- src/cuda/3d/spreadinterp3d.cuh | 19 ++-- src/cuda/CMakeLists.txt | 18 ++-- test/cuda/CMakeLists.txt | 3 + 9 files changed, 151 insertions(+), 116 deletions(-) diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index d5009de41..3866233a4 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -38,11 +38,11 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T return __fmaf_rn(a, b, c); } else if constexpr (std::is_same_v) { return __fma_rn(a, b, c); - } else { - static_assert(std::is_same_v || std::is_same_v, - "Only float and double are supported."); } -} + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + return T{0}; +}; template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index 29645f9f9..f556da8d6 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -12,6 +12,9 @@ #include +#include +#include + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) #else __inline__ __device__ double atomicAdd(double *address, double val) { @@ -104,6 +107,25 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #define COMPUTE_CAPABILITY_90_OR_HIGHER 0 #endif +template +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { + const auto raw_address = reinterpret_cast(address); + atomicAdd(raw_address, res.x); + atomicAdd(raw_address + 1, res.y); +} + +template +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { + if constexpr ( + std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { + atomicAdd(address, res); + } else { + atomicAddComplexShared(address, res); + } +} + } // namespace utils } // namespace cufinufft diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index 8f8a8a20b..04412d4e8 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -4,6 +4,7 @@ target_link_libraries(cuperftest PUBLIC cufinufft) set_target_properties(cuperftest PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) - -#file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_features(cuperftest PRIVATE cxx_std_17) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index 1d52ff884..a7fa5e6f2 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -71,89 +71,101 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -max_range = 16 if args["--prec"] == "d" else 7 +for precision in ['f', 'd']: + for dim in range(1, 4): + if dim == 1: + args["--N1"] = "16777216" + if dim == 2: + args["--N1"] = "256" + args["--N2"] = "256" + if dim == 3: + args["--N1"] = "256" + args["--N2"] = "256" + args["--N3"] = "256" + args["--prec"] = precision + max_range = 16 if args["--prec"] == "d" else 7 + for i in range(1, max_range): + args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) + print("Running with tol = 1E-" + str(i)) + for method in ['2', '1']: + args["--method"] = method + if method == '0': + data['method'].append('auto') + elif method == '1': + data['method'].append('GM') + elif method == '2': + data['method'].append('SM') + elif method == '4': + data['method'].append('BLOCK') + print("Method " + data['method'][-1]) + cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) + stdout, stderr = run_command("nsys", cmd) + if stderr != '': + print(stderr) + exit(0) + # skip all lines starting with # in stdout + conf = [x for x in stdout.splitlines() if x.startswith("#")] + print('\n'.join(conf)) + stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] + if stdout[0].startswith("bin"): + print(stdout[0]) + stdout = stdout[1:] -for i in range(1, max_range): - args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) - print("Running with tol = 1E-" + str(i)) - for method in ['2', '1']: - args["--method"] = method - if method == '0': - data['method'].append('auto') - elif method == '1': - data['method'].append('GM') - elif method == '2': - data['method'].append('SM') - elif method == '4': - data['method'].append('BLOCK') - print("Method " + data['method'][-1]) - cmd = ["profile", "--force-overwrite", "true", "-o", "cuperftest_profile", cwd + "/cuperftest"] + build_args(args) - stdout, stderr = run_command("nsys", cmd) - if stderr != '': - print(stderr) - exit(0) - # skip all lines starting with # in stdout - conf = [x for x in stdout.splitlines() if x.startswith("#")] - print('\n'.join(conf)) - stdout = [x for x in stdout.splitlines() if not x.startswith("#")][:7] - if stdout[0].startswith("bin"): - print(stdout[0]) - stdout = stdout[1:] + stdout = '\n'.join(stdout) + # convert stdout to a dataframe from csv string + dt = pd.read_csv(io.StringIO(stdout), sep=',') + setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value + exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value + print(f'setpts pts/s: {setpts}') + print(f'exec pts/s: {exec}') + cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", + "--format=csv", "--output", "cuperftest"] + stdout, _ = run_command("nsys", cmd) + # remove format from cmd + cmd = cmd[:-3] + # print(run_command("nsys", cmd)) + # print(csv) + dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") + # print(dt) + # sum the "Total Time" column of the ones that contain "fft" in name + # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) + total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() + print(f'total_fft: {total_fft}') + # drop all the rows with spread not in "Name" + dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] + # print(dt) + # exit(0) + # sort dt by column "Time (%)" + total_spread = dt['Duration (ns)'].sum() - total_fft + print(f'total_spread: {total_spread}') + if total_fft > total_spread: + print("Warning: total_fft > total_spread") + # exit(0) + # pt/s + throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread + print(f'throughput: {throughput}') + data['throughput'].append(throughput) + data['tolerance'].append(args['--tol']) + # data['setpts'].append(setpts) + data['exec'].append(exec) - stdout = '\n'.join(stdout) - # convert stdout to a dataframe from csv string - dt = pd.read_csv(io.StringIO(stdout), sep=',') - setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value - exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value - print(f'setpts pts/s: {setpts}') - print(f'exec pts/s: {exec}') - cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", - "--format=csv", "--output", "cuperftest"] - stdout, _ = run_command("nsys", cmd) - # remove format from cmd - cmd = cmd[:-3] - # print(run_command("nsys", cmd)) - # print(csv) - dt = pd.read_csv("./cuperftest_cuda_gpu_trace.csv") - # print(dt) - # sum the "Total Time" column of the ones that contain "fft" in name - # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) - total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() - print(f'total_fft: {total_fft}') - # drop all the rows with spread not in "Name" - dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] - # print(dt) - # exit(0) - # sort dt by column "Time (%)" - total_spread = dt['Duration (ns)'].sum() - total_fft - print(f'total_spread: {total_spread}') - if total_fft > total_spread: - print("Warning: total_fft > total_spread") - # exit(0) - # pt/s - throughput = float(args['--M']) * float(args['--n_runs']) * 1_000_000_000 / total_spread - print(f'throughput: {throughput}') - data['throughput'].append(throughput) - data['tolerance'].append(args['--tol']) - # data['setpts'].append(setpts) - data['exec'].append(exec) - -df = pd.DataFrame(data) -# Pivot the DataFrame -pivot_df = df.pivot(index='tolerance', columns='method') -# print(pivot_df) -# scale the throughput SM by GM -# pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] -# pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] -# scale setpts SM by GM -# pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] -# pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] -# remove the GM column -# pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) -pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) -pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) -print(pivot_df) + df = pd.DataFrame(data) + # Pivot the DataFrame + pivot_df = df.pivot(index='tolerance', columns='method') + # print(pivot_df) + # scale the throughput SM by GM + # pivot_df['throughput', 'SM'] /= pivot_df['throughput', 'GM'] + # pivot_df['throughput', 'GM'] /= pivot_df['throughput', 'GM'] + # scale setpts SM by GM + # pivot_df['exec', 'SM'] /= pivot_df['exec', 'GM'] + # pivot_df['exec', 'GM'] /= pivot_df['exec', 'GM'] + # remove the GM column + # pivot_df.drop(('throughput', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'GM'), axis=1, inplace=True) + pivot_df.drop(('exec', 'SM'), axis=1, inplace=True) + print(pivot_df) +exit(0) # Plot pivot_df.plot(kind='bar', figsize=(10, 7)) # Find the minimum throughput value diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index c7d84a9b8..56493ef73 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -138,8 +138,7 @@ __global__ void spread_1d_subprob( if (ix >= (bin_size_x + ns_2) || ix < 0) break; const cuda_complex result{cnow.x * ker1[xx - xstart], cnow.y * ker1[xx - xstart]}; - atomicAdd(&fwshared[ix].x, result.x); - atomicAdd(&fwshared[ix].y, result.y); + atomicAddComplexShared(fwshared + ix, result); } } __syncthreads(); @@ -148,8 +147,7 @@ __global__ void spread_1d_subprob( auto ix = xoffset - ns_2 + k; if (ix < (nf1 + ns_2)) { ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); + atomicAddComplexGlobal(fw + ix, fwshared[k]); } } } diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index e8a69f303..03da3ed8a 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -53,8 +53,9 @@ __global__ void spread_2d_nupts_driven( const auto outidx = ix + iy * nf1; const auto kervalue1 = ker1[xx - xstart]; const auto kervalue2 = ker2[yy - ystart]; - atomicAdd(&fw[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fw[outidx].y, cnow.y * kervalue1 * kervalue2); + const cuda_complex res{cnow.x * kervalue1 * kervalue2, + cnow.y * kervalue1 * kervalue2}; + atomicAddComplexGlobal(fw + outidx, res); } } } @@ -180,10 +181,8 @@ __global__ void spread_2d_subprob( if (ix >= (bin_size_x + rounded_ns) || ix < 0) break; const auto outidx = ix + iy * (bin_size_x + rounded_ns); const auto kervalue = ker1[xx - xstart] * ker2[yy - ystart]; - const auto resx = cnow.x * kervalue; - const auto resy = cnow.y * kervalue; - atomicAdd(&fwshared[outidx].x, resx); - atomicAdd(&fwshared[outidx].y, resy); + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); } } } @@ -200,8 +199,7 @@ __global__ void spread_2d_subprob( iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); const auto outidx = ix + iy * nf1; const auto sharedidx = i + j * (bin_size_x + rounded_ns); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } } } diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 19eae72a4..59b4661ff 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -127,8 +127,9 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const int outidx = ix + iy * nf1 + iz * nf1 * nf2; const auto ker1val = ker1[xx - xstart]; const auto kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); + const cuda_complex res{c[idxnupts[i]].x * kervalue, + c[idxnupts[i]].y * kervalue}; + atomicAddComplexGlobal(fw + outidx, res); } } } @@ -223,10 +224,8 @@ __global__ void spread_3d_subprob( const int outidx = ix + iy * (bin_size_x + rounded_ns) + iz * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); const auto kervalue = ker1[xx - xstart] * kervalue2 * kervalue3; - const auto resx = cnow.x * kervalue; - const auto resy = cnow.y * kervalue; - atomicAdd(&fwshared[outidx].x, resx); - atomicAdd(&fwshared[outidx].y, resy); + const cuda_complex res{cnow.x * kervalue, cnow.y * kervalue}; + atomicAddComplexShared(fwshared + outidx, res); } } } @@ -250,8 +249,7 @@ __global__ void spread_3d_subprob( const int outidx = ix + iy * nf1 + iz * nf1 * nf2; const int sharedidx = i + j * (bin_size_x + rounded_ns) + k * (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); + atomicAddComplexGlobal(fw + outidx, fwshared[sharedidx]); } } } @@ -408,8 +406,9 @@ __global__ void spread_3d_block_gather( for (int xx = xstartnew; xx <= xendnew; xx++) { const auto outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; const T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); + const cuda_complex res{cnow.x * kervalue1 * kervalue2 * kervalue3, + cnow.y * kervalue1 * kervalue2 * kervalue3}; + atomicAddComplexShared(fwshared + outidx, res); } } } diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 751ccfc6c..69eb0597c 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -30,6 +30,8 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_compile_options(cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -42,6 +44,8 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) target_compile_features(cufinufft_objects PRIVATE cxx_std_17) target_compile_options(cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -51,20 +55,16 @@ if (FINUFFT_SHARED_LINKING) $ $ ) - set_target_properties( - cufinufft PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - ) else () add_library(cufinufft STATIC $ $ ) - set_target_properties( - cufinufft PROPERTIES - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - ) endif () +set_target_properties( + cufinufft PROPERTIES + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" +) if (WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) @@ -80,4 +80,6 @@ set_target_properties( POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 8d77d9fdc..6555d4f64 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,9 +7,12 @@ foreach(srcfile ${test_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) + target_compile_features(${executable} PRIVATE cxx_std_17) set_target_properties(${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON ) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" From 041a536819945eb606771743b6ac6ab4ba95b6a0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 25 Jul 2024 17:41:35 -0400 Subject: [PATCH 27/39] updated script --- perftest/cuda/bench.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index a7fa5e6f2..d01a67668 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -49,13 +49,7 @@ def build_args(args): "--M": "1E8", "--tol": "1E-6"} # iterate over tol from 1E-6 to 1E-1 -data = { - 'method': [], - 'throughput': [], - 'tolerance': [], - # 'setpts': [], - 'exec': [], -} + warmup = {"--prec": "f", "--n_runs": "1", "--method": "0", @@ -71,7 +65,8 @@ def build_args(args): if stderr != '': print(stderr) exit(0) -for precision in ['f', 'd']: +for precision in ['d']: + print(f"precision: {precision}") for dim in range(1, 4): if dim == 1: args["--N1"] = "16777216" @@ -84,6 +79,16 @@ def build_args(args): args["--N3"] = "256" args["--prec"] = precision max_range = 16 if args["--prec"] == "d" else 7 + if precision == 'd' and dim == 3: + max_range = 6 + print(f"dimensions {dim}") + data = { + 'method': [], + 'throughput': [], + 'tolerance': [], + # 'setpts': [], + 'exec': [], + } for i in range(1, max_range): args["--tol"] = "1E-" + ("0" if i < 10 else "") + str(i) print("Running with tol = 1E-" + str(i)) @@ -116,8 +121,8 @@ def build_args(args): dt = pd.read_csv(io.StringIO(stdout), sep=',') setpts = dt[dt["event"].str.contains("setpts")]['nupts/s'].sum() # it is only one row it extracts the value exec = dt[dt["event"].str.contains("exec")]['nupts/s'].sum() # it is only one row it extracts the value - print(f'setpts pts/s: {setpts}') - print(f'exec pts/s: {exec}') + # print(f'setpts pts/s: {setpts}') + # print(f'exec pts/s: {exec}') cmd = ["stats", "--force-overwrite=true", "--force-export=true", "--report", "cuda_gpu_trace", "--report", "cuda_gpu_kern_sum", "cuperftest_profile.nsys-rep", "--format=csv", "--output", "cuperftest"] stdout, _ = run_command("nsys", cmd) @@ -130,14 +135,14 @@ def build_args(args): # sum the "Total Time" column of the ones that contain "fft" in name # print(dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]) total_fft = dt[dt["Name"].str.contains("fft") & ~dt["Name"].str.contains("cufinufft")]['Duration (ns)'].sum() - print(f'total_fft: {total_fft}') + # print(f'total_fft: {total_fft}') # drop all the rows with spread not in "Name" dt = dt[dt["Name"].str.contains("cufinufft::spreadinterp::spread")] # print(dt) # exit(0) # sort dt by column "Time (%)" total_spread = dt['Duration (ns)'].sum() - total_fft - print(f'total_spread: {total_spread}') + # print(f'total_spread: {total_spread}') if total_fft > total_spread: print("Warning: total_fft > total_spread") # exit(0) @@ -148,8 +153,6 @@ def build_args(args): data['tolerance'].append(args['--tol']) # data['setpts'].append(setpts) data['exec'].append(exec) - - df = pd.DataFrame(data) # Pivot the DataFrame pivot_df = df.pivot(index='tolerance', columns='method') From 54683c3c6b2c49fc25a3a3b00c88a39bef2b0263 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 11:36:27 -0400 Subject: [PATCH 28/39] fixed bin sizes --- include/cufinufft/impl.h | 4 ++-- perftest/cuda/bench.py | 4 ++-- src/cuda/common.cu | 7 ++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 7d63df51e..3d6e99b35 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -143,10 +143,10 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * * For type 2, we always default to method 1 (GM). */ - // query the device for the amount of shared memory available - if (dim == 3 && std::is_same_v) { + if (d_plan->type == 2) { d_plan->opts.gpu_method = 1; } else { + // query the device for the amount of shared memory available int shared_mem_per_block{}; cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); diff --git a/perftest/cuda/bench.py b/perftest/cuda/bench.py index d01a67668..c22c2af9f 100644 --- a/perftest/cuda/bench.py +++ b/perftest/cuda/bench.py @@ -54,7 +54,7 @@ def build_args(args): "--n_runs": "1", "--method": "0", "--N1": "256", - "--N2": "256", + # "--N2": "256", # "--N3": "256", "--M": "256", "--tol": "1E-1"} @@ -67,7 +67,7 @@ def build_args(args): exit(0) for precision in ['d']: print(f"precision: {precision}") - for dim in range(1, 4): + for dim in range(1, 2): if dim == 1: args["--N1"] = "16777216" if dim == 2: diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 8499aea8a..eba170a24 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -256,13 +256,10 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { if (const auto err = cudaGetLastError(); err != cudaSuccess) { throw std::runtime_error(cudaGetErrorString(err)); } - // use half of the available shared memory if double precision - if constexpr (std::is_same_v) { - shared_mem_per_block /= 2; - } + // use 1/6 of the shared memory for the binsize + shared_mem_per_block /= 6; const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; - opts->gpu_binsizex = bin_size; } opts->gpu_binsizey = 1; From dc3a62877cd39b1c0d71778a50a6074df2373c6b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 12:01:43 -0400 Subject: [PATCH 29/39] using floor in fold_rescale updated changelog --- CHANGELOG | 9 +++++++ include/cufinufft/spreadinterp.h | 44 +++++++++++++++----------------- 2 files changed, 30 insertions(+), 23 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 000e03b6f..ba024e07f 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -38,6 +38,15 @@ V 2.3.0beta (7/21/24) any 32-bit integers to 64-bit when calling cufinufft(f)_setpts. Note that internally, 32-bit integers are still used, so calling cufinufft with more than 2e9 points will fail. This restriction may be lifted in the future. +* cuFINUFFT binsize is now a function of the shared memory available where + possible. +* cuFINUFFT GM 1D sorts using thrust::sort instead of bin-sort. +* cuFINUFFT using the new normalized Horner coefficients and added support + for 1.25. +* cuFINUFFT new compile flags for extra-vectorization, flushing single + precision denormals to 0 and using fma where possible. +* cuFINUFFT using intrinsics in foldrescale and other places to increase + performance V 2.2.0 (12/12/23) diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 3866233a4..0ab7aba9a 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -10,40 +10,38 @@ namespace cufinufft { namespace spreadinterp { template -constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { - constexpr const auto x2pi = T(0.159154943091895345554011992339482617); - constexpr const auto half = T(0.5); +static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { + if constexpr (std::is_same_v) { + return __fmaf_rn(a, b, c); + } else if constexpr (std::is_same_v) { + return __fma_rn(a, b, c); + } + static_assert(std::is_same_v || std::is_same_v, + "Only float and double are supported."); + return T{0}; +} + +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) { + constexpr auto x2pi = T(0.159154943091895345554011992339482617); + constexpr auto half = T(0.5); #if defined(__CUDA_ARCH__) if constexpr (std::is_same_v) { - auto result = __fmaf_rn(x, x2pi, half); - result = __fsub_rd(result, truncf(result)); - return __fmul_rd(result, static_cast(N)); + const auto result = fma(x, x2pi, half); + return (result - floorf(result)) * static_cast(N); } else if constexpr (std::is_same_v) { - auto result = __fma_rn(x, x2pi, half); - result = __dsub_rd(result, trunc(result)); - return __dmul_rd(result, static_cast(N)); + const auto result = fma(x, x2pi, half); + return (result - floor(result)) * static_cast(N); } else { static_assert(std::is_same_v || std::is_same_v, "Only float and double are supported."); } #else - const auto result = std::fma(x, x2pi, half); - return (result - std::trunc(result)) * static_cast(N); + const auto result = fma(x, x2pi, half); + return (result - std::floor(result)) * static_cast(N); #endif } -template -static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { - if constexpr (std::is_same_v) { - return __fmaf_rn(a, b, c); - } else if constexpr (std::is_same_v) { - return __fma_rn(a, b, c); - } - static_assert(std::is_same_v || std::is_same_v, - "Only float and double are supported."); - return T{0}; -}; - template static inline T evaluate_kernel(T x, const finufft_spread_opts &opts) /* ES ("exp sqrt") kernel evaluation at single real argument: From b3237f7e29a75232b03e6ce4bc2d5703fe811cb8 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 12:16:53 -0400 Subject: [PATCH 30/39] fixed a mistake --- include/cufinufft/impl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index 3d6e99b35..dcf00f31b 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -143,7 +143,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran * * For type 2, we always default to method 1 (GM). */ - if (d_plan->type == 2) { + if (type == 2) { d_plan->opts.gpu_method = 1; } else { // query the device for the amount of shared memory available From db80aad0f21cedccc85d1eca211c4286a18a198e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 26 Jul 2024 15:44:47 -0400 Subject: [PATCH 31/39] added comments for review --- CHANGELOG | 1 + include/cufinufft/impl.h | 1 + include/cufinufft/spreadinterp.h | 24 +++++++++++++++++------- include/cufinufft/utils.h | 25 ++++++++++++++++++++----- src/cuda/common.cu | 19 +++++++++---------- 5 files changed, 48 insertions(+), 22 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index ba024e07f..d25d7e5d7 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -47,6 +47,7 @@ V 2.3.0beta (7/21/24) precision denormals to 0 and using fma where possible. * cuFINUFFT using intrinsics in foldrescale and other places to increase performance +* cuFINUFFT using SM90 float2 vector atomicAdd where supported V 2.2.0 (12/12/23) diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index dcf00f31b..c3021a7ff 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -60,6 +60,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran Variables and arrays inside the plan struct are set and allocated. Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21. + Marco Barbone 07/26/24. Using SM when shared memory available is enough. */ int ier; cuDoubleComplex *d_a = nullptr; // fseries temp data diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h index 0ab7aba9a..2963d381d 100644 --- a/include/cufinufft/spreadinterp.h +++ b/include/cufinufft/spreadinterp.h @@ -12,8 +12,10 @@ namespace spreadinterp { template static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T c) { if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even return __fmaf_rn(a, b, c); } else if constexpr (std::is_same_v) { + // fused multiply-add, round to nearest even return __fma_rn(a, b, c); } static_assert(std::is_same_v || std::is_same_v, @@ -21,23 +23,31 @@ static __forceinline__ __device__ constexpr T fma(const T a, const T b, const T return T{0}; } -template -constexpr __forceinline__ __host__ __device__ T fold_rescale(const T x, const V N) { +template +constexpr __forceinline__ __host__ __device__ T fold_rescale(T x, int N) { constexpr auto x2pi = T(0.159154943091895345554011992339482617); constexpr auto half = T(0.5); #if defined(__CUDA_ARCH__) if constexpr (std::is_same_v) { - const auto result = fma(x, x2pi, half); - return (result - floorf(result)) * static_cast(N); + // fused multiply-add, round to nearest even + auto result = __fmaf_rn(x, x2pi, half); + // subtract, round down + result = __fsub_rd(result, floorf(result)); + // multiply, round down + return __fmul_rd(result, static_cast(N)); } else if constexpr (std::is_same_v) { - const auto result = fma(x, x2pi, half); - return (result - floor(result)) * static_cast(N); + // fused multiply-add, round to nearest even + auto result = __fma_rn(x, x2pi, half); + // subtract, round down + result = __dsub_rd(result, floor(result)); + // multiply, round down + return __dmul_rd(result, static_cast(N)); } else { static_assert(std::is_same_v || std::is_same_v, "Only float and double are supported."); } #else - const auto result = fma(x, x2pi, half); + const auto result = std::fma(x, x2pi, half); return (result - std::floor(result)) * static_cast(N); #endif } diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index f556da8d6..b4db528ae 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -74,11 +74,14 @@ template T infnorm(int n, std::complex *a) { #ifdef __CUDA_ARCH__ __forceinline__ __device__ auto interval(const int ns, const float x) { + // float to int round up and fused multiply-add to round up const auto xstart = __float2int_ru(__fmaf_ru(ns, -.5f, x)); - const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); + // float to int round down and fused multiply-add to round down + const auto xend = __float2int_rd(__fmaf_rd(ns, .5f, x)); return int2{xstart, xend}; } __forceinline__ __device__ auto interval(const int ns, const double x) { + // same as above const auto xstart = __double2int_ru(__fma_ru(ns, -.5, x)); const auto xend = __double2int_rd(__fma_rd(ns, .5, x)); return int2{xstart, xend}; @@ -107,17 +110,29 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #define COMPUTE_CAPABILITY_90_OR_HIGHER 0 #endif +/** + * does a complex atomic add on a shared memory address + * it adds the real and imaginary parts separately + * cuda does not support atomic operations + * on complex numbers on shared memory directly + */ + template -static __forceinline__ __device__ void atomicAddComplexShared( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, + cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); } +/** + * does a complex atomic add on a global memory address + * since cuda 90 atomic operations on complex numbers + * on shared memory are supported so we leverage them + */ template -static __forceinline__ __device__ void atomicAddComplexGlobal( - cuda_complex *address, cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, + cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); diff --git a/src/cuda/common.cu b/src/cuda/common.cu index eba170a24..19b0cbd1a 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -202,8 +202,7 @@ void onedim_fseries_kernel_compute(CUFINUFFT_BIGINT nf, T *f, std::complex std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size_y, int bin_size_z) { - // printf("dim, ns, bin_size_x, bin_size_y, bin_size_z: %d %d %d %d %d\n", dim, ns, - // bin_size_x, bin_size_y, bin_size_z); + // Helper to compute the shared memory required for the spreader when using SM int adjusted_ns = bin_size_x + ((ns + 1) / 2) * 2; if (dim == 1) { @@ -221,17 +220,18 @@ std::size_t shared_memory_required(int dim, int ns, int bin_size_x, int bin_size return adjusted_ns * sizeof(cuda_complex); } -// Function to find bin_size_x == bin_size_y where bin_size_x * bin_size_y < MemSize -template int find_bin_size(std::size_t MemSize, int dim, int ns) { +// Function to find bin_size_x == bin_size_y +// where bin_size_x * bin_size_y * bin_size_z < mem_size +// TODO: this can be done without a loop by using a direct formula +template int find_bin_size(std::size_t mem_size, int dim, int ns) { int binsize = 1; // Start with the smallest possible bin size - while (true) { // Calculate the shared memory required for the current bin_size_x and bin_size_y std::size_t required_memory = shared_memory_required(dim, ns, binsize, binsize, binsize); // Check if the required memory is less than the available memory - if (required_memory > MemSize) { + if (required_memory > mem_size) { // If the condition is met, return the current bin_size_x return binsize - 1; } @@ -243,6 +243,9 @@ template int find_bin_size(std::size_t MemSize, int dim, int ns) { template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { + // Marco Barbone 07/26/24. Using the shared memory available on the device, to + // determine the optimal binsize for the spreader. + // TODO: This can still be improved some sizes are hardcoded still int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { @@ -290,10 +293,6 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } break; } } - // const auto shared_mem_required = shared_memory_required( - // dim, ns, opts->gpu_binsizex, opts->gpu_binsizey, opts->gpu_binsizez); - // printf("binsizex: %d, binsizey: %d, shared_mem_required %ld (bytes)\n", - // opts->gpu_binsizex, opts->gpu_binsizey, shared_mem_required); opts->gpu_binsizez = 1; } break; case 3: { From c225fb56eac9b288fd518ea875e4b7ca74ed19ba Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 31 Jul 2024 12:42:52 -0400 Subject: [PATCH 32/39] fixing review comments --- src/cuda/common.cu | 2 + src/ker_horner_allw_loop.inc | 207 ----------------------------------- 2 files changed, 2 insertions(+), 207 deletions(-) delete mode 100644 src/ker_horner_allw_loop.inc diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 19b0cbd1a..6e7064b25 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -260,6 +260,8 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { throw std::runtime_error(cudaGetErrorString(err)); } // use 1/6 of the shared memory for the binsize + // From experiments on multiple GPUs this gives the best tradeoff. + // It is within 90% of the maximum performance for all GPUs tested. shared_mem_per_block /= 6; const int bin_size = shared_mem_per_block / sizeof(cuda_complex) - ((ns + 1) / 2) * 2; diff --git a/src/ker_horner_allw_loop.inc b/src/ker_horner_allw_loop.inc deleted file mode 100644 index 953c4618b..000000000 --- a/src/ker_horner_allw_loop.inc +++ /dev/null @@ -1,207 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - constexpr FLT c0[] = {4.5147043243215343E+01, 4.5147043243215350E+01}; - constexpr FLT c1[] = {5.7408070938221307E+01, -5.7408070938221300E+01}; - constexpr FLT c2[] = {-1.8395117920046544E+00, -1.8395117920046602E+00}; - constexpr FLT c3[] = {-2.0382426253182064E+01, 2.0382426253182086E+01}; - constexpr FLT c4[] = {-2.0940804433577389E+00, -2.0940804433577398E+00}; - for (int i=0; i<2; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==3) { - constexpr FLT c0[] = {1.5653991189315130E+02, 8.8006872410780375E+02, 1.5653991189967169E+02}; - constexpr FLT c1[] = {3.1653018869611083E+02, 2.7828437114531882E-14, -3.1653018868907077E+02}; - constexpr FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117128E+02}; - constexpr FLT c3[] = {-1.5357716116473071E+01, 1.0675641863333163E-13, 1.5357716122720211E+01}; - constexpr FLT c4[] = {-3.7757583061523640E+01, 5.3222970968867450E+01, -3.7757583054647341E+01}; - constexpr FLT c5[] = {-3.9654011076088449E+00, 4.9521033695040343E-14, 3.9654011139270429E+00}; - for (int i=0; i<3; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - constexpr FLT c0[] = {5.4284366850213223E+02, 1.0073871433088407E+04, 1.0073871433088407E+04, 5.4284366850213269E+02}; - constexpr FLT c1[] = {1.4650917259256942E+03, 6.1905285583602899E+03, -6.1905285583602899E+03, -1.4650917259256942E+03}; - constexpr FLT c2[] = {1.4186910680718349E+03, -1.3995339862725573E+03, -1.3995339862725571E+03, 1.4186910680718345E+03}; - constexpr FLT c3[] = {5.1133995502497481E+02, -1.4191608683682980E+03, 1.4191608683682985E+03, -5.1133995502497402E+02}; - constexpr FLT c4[] = {-4.8293622641173705E+01, 3.9393732546135901E+01, 3.9393732546136945E+01, -4.8293622641173727E+01}; - constexpr FLT c5[] = {-7.8386867802392203E+01, 1.4918904800408794E+02, -1.4918904800408947E+02, 7.8386867802392203E+01}; - constexpr FLT c6[] = {-1.0039212571700403E+01, 5.0626747735617119E+00, 5.0626747735622777E+00, -1.0039212571700599E+01}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - constexpr FLT c0[] = {9.9223677575398506E+02, 3.7794697666613349E+04, 9.8715771010760567E+04, 3.7794697666613327E+04, 9.9223677575398540E+02}; - constexpr FLT c1[] = {3.0430174925083834E+03, 3.7938404259811425E+04, -4.1880997701304513E-12, -3.7938404259811403E+04, -3.0430174925083829E+03}; - constexpr FLT c2[] = {3.6092689177271232E+03, 7.7501368899498630E+03, -2.2704627332475000E+04, 7.7501368899498721E+03, 3.6092689177271213E+03}; - constexpr FLT c3[] = {1.9990077310495410E+03, -3.8875294641277214E+03, 1.6137850891850780E-11, 3.8875294641277346E+03, -1.9990077310495410E+03}; - constexpr FLT c4[] = {4.0071733590403909E+02, -1.5861137916762543E+03, 2.3839858699098786E+03, -1.5861137916762577E+03, 4.0071733590403909E+02}; - constexpr FLT c5[] = {-9.1301168206167233E+01, 1.2316471075215087E+02, 1.9401736511657983E-12, -1.2316471075215495E+02, 9.1301168206166977E+01}; - constexpr FLT c6[] = {-5.5339722671222894E+01, 1.1960590540262304E+02, -1.5249941358312140E+02, 1.1960590540262024E+02, -5.5339722671224088E+01}; - constexpr FLT c7[] = {-3.3762488150349581E+00, 2.2839981873006558E+00, 8.2819625836083788E-12, -2.2839981872910400E+00, 3.3762488150351579E+00}; - for (int i=0; i<5; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - constexpr FLT c0[] = {2.0553833234911899E+03, 1.5499537739913145E+05, 8.1177907023291232E+05, 8.1177907023291232E+05, 1.5499537739913145E+05, 2.0553833235005700E+03}; - constexpr FLT c1[] = {7.1269776034442684E+03, 2.0581923258843319E+05, 3.1559612614917679E+05, -3.1559612614917639E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}; - constexpr FLT c2[] = {1.0023404568475091E+04, 9.0916650498360206E+04, -1.0095927514054631E+05, -1.0095927514054631E+05, 9.0916650498360163E+04, 1.0023404568484637E+04}; - constexpr FLT c3[] = {7.2536109410387444E+03, 4.8347162752603444E+03, -5.0512736602018485E+04, 5.0512736602018602E+04, -4.8347162752602972E+03, -7.2536109410297577E+03}; - constexpr FLT c4[] = {2.7021878300949775E+03, -7.8773465553972374E+03, 5.2105876478343516E+03, 5.2105876478343944E+03, -7.8773465553972464E+03, 2.7021878301048723E+03}; - constexpr FLT c5[] = {3.2120291706547630E+02, -1.8229189469936912E+03, 3.7928113414428476E+03, -3.7928113414427171E+03, 1.8229189469937239E+03, -3.2120291705638328E+02}; - constexpr FLT c6[] = {-1.2051267090537345E+02, 2.2400507411399769E+02, -1.2506575852547746E+02, -1.2506575852531816E+02, 2.2400507411399730E+02, -1.2051267089640162E+02}; - constexpr FLT c7[] = {-4.5977202613346755E+01, 1.1536880606857032E+02, -1.7819720186492938E+02, 1.7819720186504426E+02, -1.1536880606851560E+02, 4.5977202622148354E+01}; - constexpr FLT c8[] = {-1.5631081288822022E+00, 7.1037430590520445E-01, -6.9838401262032682E-02, -6.9838401199524530E-02, 7.1037430591562767E-01, -1.5631081203751171E+00}; - for (int i=0; i<6; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - constexpr FLT c0[] = {3.9948351830487582E+03, 5.4715865608590841E+05, 5.0196413492771825E+06, 9.8206709220713321E+06, 5.0196413492771871E+06, 5.4715865608590853E+05, 3.9948351830642619E+03}; - constexpr FLT c1[] = {1.5290160332974698E+04, 8.7628248584320419E+05, 3.4421061790934466E+06, 6.5103105025927563E-10, -3.4421061790934466E+06, -8.7628248584320443E+05, -1.5290160332958061E+04}; - constexpr FLT c2[] = {2.4458227486779258E+04, 5.3904618484139442E+05, 2.4315566181017563E+05, -1.6133959371974308E+06, 2.4315566181017424E+05, 5.3904618484139396E+05, 2.4458227486795091E+04}; - constexpr FLT c3[] = {2.1166189345881652E+04, 1.3382732160223150E+05, -3.3113450969689601E+05, 2.5683270626620309E-10, 3.3113450969689793E+05, -1.3382732160223130E+05, -2.1166189345866896E+04}; - constexpr FLT c4[] = {1.0542795672344870E+04, -7.0739172265096349E+03, -6.5563293056048627E+04, 1.2429734005960199E+05, -6.5563293056048671E+04, -7.0739172265096395E+03, 1.0542795672361222E+04}; - constexpr FLT c5[] = {2.7903491906228451E+03, -1.0975382873972989E+04, 1.3656979541145318E+04, 4.9801640867456605E-10, -1.3656979541144143E+04, 1.0975382873973054E+04, -2.7903491906078325E+03}; - constexpr FLT c6[] = {1.6069721418054232E+02, -1.5518707872249406E+03, 4.3634273936649897E+03, -5.9891976420600004E+03, 4.3634273936636964E+03, -1.5518707872250636E+03, 1.6069721419532380E+02}; - constexpr FLT c7[] = {-1.2289277373866669E+02, 2.8583630927761948E+02, -2.8318194617245649E+02, -3.5832266061541795E-11, 2.8318194617438041E+02, -2.8583630927744588E+02, 1.2289277375319726E+02}; - constexpr FLT c8[] = {-3.2270164914244575E+01, 9.1892112257588494E+01, -1.6710678096380749E+02, 2.0317049305436126E+02, -1.6710678096299210E+02, 9.1892112257580479E+01, -3.2270164900216493E+01}; - constexpr FLT c9[] = {-1.4761409684320093E-01, -9.1862771282699351E-01, 1.2845147740384601E+00, -5.0335941641611417E-10, -1.2845147731561353E+00, 9.1862771293147938E-01, 1.4761410890830065E-01}; - for (int i=0; i<7; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==8) { - constexpr FLT c0[] = {7.3898000697448142E+03, 1.7297637497600052E+06, 2.5578341605285820E+07, 8.4789650417103425E+07, 8.4789650417103410E+07, 2.5578341605285831E+07, 1.7297637497600054E+06, 7.3898000697448097E+03}; - constexpr FLT c1[] = {3.0719636811267621E+04, 3.1853145713323932E+06, 2.3797981861403704E+07, 2.4569731244678468E+07, -2.4569731244678475E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267595E+04}; - constexpr FLT c2[] = {5.4488498478251728E+04, 2.4101183255475122E+06, 6.4554051283428418E+06, -8.9200440393090658E+06, -8.9200440393090583E+06, 6.4554051283428296E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; - constexpr FLT c3[] = {5.3926359802542138E+04, 9.0469037926849385E+05, -6.0897036277695757E+05, -3.0743852105799988E+06, 3.0743852105800197E+06, 6.0897036277696723E+05, -9.0469037926849280E+05, -5.3926359802542152E+04}; - constexpr FLT c4[] = {3.2444118016247576E+04, 1.3079802224392162E+05, -5.8652889370128687E+05, 4.2333306008153327E+05, 4.2333306008153543E+05, -5.8652889370128710E+05, 1.3079802224392179E+05, 3.2444118016247601E+04}; - constexpr FLT c5[] = {1.1864306345505300E+04, -2.2700360645707835E+04, -5.0713607251411129E+04, 1.8308704458211461E+05, -1.8308704458211147E+05, 5.0713607251410089E+04, 2.2700360645707704E+04, -1.1864306345505296E+04}; - constexpr FLT c6[] = {2.2812256770903396E+03, -1.1569135767377908E+04, 2.0942387020802456E+04, -1.1661592834947036E+04, -1.1661592834946512E+04, 2.0942387020804370E+04, -1.1569135767377549E+04, 2.2812256770903291E+03}; - constexpr FLT c7[] = {8.5503535636977634E+00, -9.7513976461196773E+02, 3.8242995179186414E+03, -6.9201295567263214E+03, 6.9201295567309990E+03, -3.8242995179140653E+03, 9.7513976461263269E+02, -8.5503535636935535E+00}; - constexpr FLT c8[] = {-1.0230637348345098E+02, 2.8246898554249236E+02, -3.8638201738252542E+02, 1.9106407992706994E+02, 1.9106407993520349E+02, -3.8638201738414602E+02, 2.8246898554297724E+02, -1.0230637348344338E+02}; - constexpr FLT c9[] = {-1.9200143062942033E+01, 6.1692257626381128E+01, -1.2981109187954436E+02, 1.8681284209765820E+02, -1.8681284209914423E+02, 1.2981109187880136E+02, -6.1692257626381128E+01, 1.9200143062947838E+01}; - constexpr FLT c10[] = {3.7894993761363543E-01, -1.7334408835887836E+00, 2.5271184092462979E+00, -1.2600963912775105E+00, -1.2600963880718390E+00, 2.5271184126204269E+00, -1.7334408829982433E+00, 3.7894993761427903E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==9) { - constexpr FLT c0[] = {1.3136365370186153E+04, 5.0196413492771843E+06, 1.1303327711722577E+08, 5.8225443924996734E+08, 9.7700272582690716E+08, 5.8225443924996805E+08, 1.1303327711722578E+08, 5.0196413492772263E+06, 1.3136365370186144E+04}; - constexpr FLT c1[] = {5.8623313038274369E+04, 1.0326318537280345E+07, 1.2898448324824868E+08, 3.0522863709830379E+08, 7.2435840302079811E-08, -3.0522863709830397E+08, -1.2898448324824865E+08, -1.0326318537280394E+07, -5.8623313038274347E+04}; - constexpr FLT c2[] = {1.1335001341875960E+05, 9.0726133144784812E+06, 5.3501544534038134E+07, -2.6789524644140172E+05, -1.2483923718899371E+08, -2.6789524644173466E+05, 5.3501544534038089E+07, 9.0726133144785147E+06, 1.1335001341875963E+05}; - constexpr FLT c3[] = {1.2489113703229754E+05, 4.3035547171861976E+06, 6.3021978510599164E+06, -2.6014941986658975E+07, 5.3074599277157087E-08, 2.6014941986659400E+07, -6.3021978510598680E+06, -4.3035547171862088E+06, -1.2489113703229751E+05}; - constexpr FLT c4[] = {8.6425493435991244E+04, 1.0891182836653311E+06, -2.0713033564200432E+06, -2.8994941183505901E+06, 7.5905338661206560E+06, -2.8994941183505324E+06, -2.0713033564200350E+06, 1.0891182836653385E+06, 8.6425493435991288E+04}; - constexpr FLT c5[] = {3.8657354724013800E+04, 7.9936390113329253E+04, -7.0458265546791849E+05, 1.0151095605715540E+06, 7.5990350518026299E-08, -1.0151095605718379E+06, 7.0458265546793933E+05, -7.9936390113333939E+04, -3.8657354724013821E+04}; - constexpr FLT c6[] = {1.0779131453134645E+04, -3.3466718311300116E+04, -1.3245366618985940E+04, 1.8238470515354761E+05, -2.9285656292981049E+05, 1.8238470515352563E+05, -1.3245366618989963E+04, -3.3466718311299133E+04, 1.0779131453134627E+04}; - constexpr FLT c7[] = {1.4992527030548656E+03, -9.7024371533879767E+03, 2.3216330734078529E+04, -2.3465262819038293E+04, -4.5678067266366728E-08, 2.3465262819229152E+04, -2.3216330734050898E+04, 9.7024371533899721E+03, -1.4992527030548690E+03}; - constexpr FLT c8[] = {-7.9857427421152821E+01, -4.0585588534976301E+02, 2.6054813773370911E+03, -6.1806593581469824E+03, 8.0679596873459095E+03, -6.1806593581737125E+03, 2.6054813773390433E+03, -4.0585588535087578E+02, -7.9857427421118601E+01}; - constexpr FLT c9[] = {-7.1572272057928345E+01, 2.2785637019390455E+02, -3.9109820766111051E+02, 3.3597424707310040E+02, -1.3908671051550088E-08, -3.3597424727519922E+02, 3.9109820767448468E+02, -2.2785637019111829E+02, 7.1572272057948652E+01}; - constexpr FLT c10[] = {-9.8886360697883688E+00, 3.5359026950204516E+01, -8.5251867695464611E+01, 1.4285748013461193E+02, -1.6935269664190733E+02, 1.4285748014610570E+02, -8.5251867686017064E+01, 3.5359026947336602E+01, -9.8886360697963340E+00}; - for (int i=0; i<9; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - constexpr FLT c0[] = {2.2594586605749224E+04, 1.3595989066786611E+07, 4.4723032442444932E+08, 3.3781755837397552E+09, 8.6836783895849857E+09, 8.6836783895849838E+09, 3.3781755837397523E+09, 4.4723032442444944E+08, 1.3595989066786496E+07, 2.2594586605749344E+04}; - constexpr FLT c1[] = {1.0729981697645644E+05, 3.0651490267742995E+07, 5.9387966085130477E+08, 2.4434902657508349E+09, 2.0073077861288934E+09, -2.0073077861288950E+09, -2.4434902657508330E+09, -5.9387966085130477E+08, -3.0651490267742828E+07, -1.0729981697645634E+05}; - constexpr FLT c2[] = {2.2340399734184612E+05, 3.0258214643190462E+07, 3.1512411458738238E+08, 4.3618276932319850E+08, -7.8178848450497270E+08, -7.8178848450497031E+08, 4.3618276932319820E+08, 3.1512411458738214E+08, 3.0258214643190324E+07, 2.2340399734184553E+05}; - constexpr FLT c3[] = {2.6917433004353492E+05, 1.6875651476661246E+07, 7.4664745481963649E+07, -9.5882157211117968E+07, -2.0622994435532477E+08, 2.0622994435532823E+08, 9.5882157211118430E+07, -7.4664745481963366E+07, -1.6875651476661157E+07, -2.6917433004353428E+05}; - constexpr FLT c4[] = {2.0818422772177897E+05, 5.6084730690362593E+06, 1.4435118192351859E+06, -4.0063869969544269E+07, 3.2803674392747816E+07, 3.2803674392746560E+07, -4.0063869969546124E+07, 1.4435118192352206E+06, 5.6084730690362155E+06, 2.0818422772177868E+05}; - constexpr FLT c5[] = {1.0781139496011086E+05, 9.9202615851199278E+05, -3.3266265543961083E+06, -4.8557049011452327E+05, 1.0176155522772400E+07, -1.0176155522773268E+07, 4.8557049011599307E+05, 3.3266265543962419E+06, -9.9202615851196356E+05, -1.0781139496011072E+05}; - constexpr FLT c6[] = {3.7380102688153638E+04, 1.2716675000361241E+04, -6.2163527451762755E+05, 1.4157962667184302E+06, -8.4419693137719855E+05, -8.4419693137682532E+05, 1.4157962667184921E+06, -6.2163527451772091E+05, 1.2716675000342160E+04, 3.7380102688153478E+04}; - constexpr FLT c7[] = {8.1238936393894573E+03, -3.4872365530440075E+04, 2.3913680325287874E+04, 1.2428850301835715E+05, -3.2158255329711520E+05, 3.2158255329964001E+05, -1.2428850301842803E+05, -2.3913680325138281E+04, 3.4872365530466821E+04, -8.1238936393894610E+03}; - constexpr FLT c8[] = {7.8515926628982811E+02, -6.6607899119346384E+03, 2.0167398338412942E+04, -2.8951401344643764E+04, 1.4622828141516249E+04, 1.4622828142773422E+04, -2.8951401346273171E+04, 2.0167398338466974E+04, -6.6607899119428766E+03, 7.8515926628979298E+02}; - constexpr FLT c9[] = {-1.0147176570538747E+02, -3.5304284178326540E+01, 1.3576976855470537E+03, -4.3921059355373945E+03, 7.3232085265656797E+03, -7.3232085282537992E+03, 4.3921059362506849E+03, -1.3576976853984515E+03, 3.5304284186128150E+01, 1.0147176570552679E+02}; - constexpr FLT c10[] = {-4.3161545259359876E+01, 1.5498490982726668E+02, -3.1771250761814974E+02, 3.7215448796966825E+02, -1.7181762811175784E+02, -1.7181762918070896E+02, 3.7215448823960344E+02, -3.1771250765054128E+02, 1.5498490982861634E+02, -4.3161545259484186E+01}; - constexpr FLT c11[] = {-4.2916172038642904E+00, 1.7402146073587435E+01, -4.7947588063038118E+01, 9.2697697961204668E+01, -1.2821427624698006E+02, 1.2821427667135228E+02, -9.2697698383138089E+01, 4.7947588092305367E+01, -1.7402146072063207E+01, 4.2916172038214455E+00}; - for (int i=0; i<10; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==11) { - constexpr FLT c0[] = {3.7794653219809712E+04, 3.4782300224660814E+07, 1.6188020733727572E+09, 1.7196758809615025E+10, 6.3754384857724686E+10, 9.7196447559193588E+10, 6.3754384857724686E+10, 1.7196758809615013E+10, 1.6188020733727574E+09, 3.4782300224660836E+07, 3.7794653219808912E+04}; - constexpr FLT c1[] = {1.8969206922085886E+05, 8.4769319065313712E+07, 2.4230555767723413E+09, 1.5439732722639107E+10, 2.7112836839612331E+10, 7.5382856415600940E-06, -2.7112836839612324E+10, -1.5439732722639109E+10, -2.4230555767723413E+09, -8.4769319065313712E+07, -1.8969206922085691E+05}; - constexpr FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266618E+09, 4.7070559561237240E+09, -1.2448027572952247E+09, -1.0161446790279316E+10, -1.2448027572952359E+09, 4.7070559561237249E+09, 1.5259983101266608E+09, 9.2050522922791883E+07, 4.2138380313901132E+05}; - constexpr FLT c3[] = {5.4814313598122029E+05, 5.8085130777589604E+07, 4.9484006166551131E+08, 1.6222124676641059E+08, -2.0440440381345210E+09, 1.6029666825264191E-05, 2.0440440381345406E+09, -1.6222124676640612E+08, -4.9484006166551065E+08, -5.8085130777589574E+07, -5.4814313598121749E+05}; - constexpr FLT c4[] = {4.6495183529254969E+05, 2.3067199578027170E+07, 6.9832590192482471E+07, -2.2024799260683393E+08, -1.2820270942588173E+08, 5.1017181199129957E+08, -1.2820270942587103E+08, -2.2024799260683718E+08, 6.9832590192482680E+07, 2.3067199578027181E+07, 4.6495183529254753E+05}; - constexpr FLT c5[] = {2.7021781043532956E+05, 5.6764510325100170E+06, -5.5650761736746123E+06, -3.9907385617899098E+07, 7.2453390663685441E+07, 1.3807321808330796E-06, -7.2453390663686499E+07, 3.9907385617896959E+07, 5.5650761736744791E+06, -5.6764510325100273E+06, -2.7021781043532840E+05}; - constexpr FLT c6[] = {1.0933249308680632E+05, 6.9586821127988759E+05, -3.6860240321936086E+06, 2.7428169457744057E+06, 8.3392008440658972E+06, -1.6402201025049815E+07, 8.3392008440622678E+06, 2.7428169457778567E+06, -3.6860240321934861E+06, 6.9586821127989655E+05, 1.0933249308680571E+05}; - constexpr FLT c7[] = {3.0203516161820731E+04, -3.6879059542738614E+04, -4.1141031216769724E+05, 1.4111389975281695E+06, -1.5914376635274226E+06, 6.7631682826831895E-06, 1.5914376635404355E+06, -1.4111389975219201E+06, 4.1141031216798135E+05, 3.6879059542753101E+04, -3.0203516161820640E+04}; - constexpr FLT c8[] = {5.1670143574923986E+03, -2.8613147115359603E+04, 4.3560195427027051E+04, 4.8438679581734432E+04, -2.5856630639957223E+05, 3.7994883866286115E+05, -2.5856630639708077E+05, 4.8438679579228658E+04, 4.3560195427174098E+04, -2.8613147115353891E+04, 5.1670143574923814E+03}; - constexpr FLT c9[] = {3.0888018539742438E+02, -3.7949446187486474E+03, 1.4313303205130735E+04, -2.6681600236165083E+04, 2.3856005159699442E+04, -1.9072153968212169E-06, -2.3856005160079862E+04, 2.6681600234262976E+04, -1.4313303204940523E+04, 3.7949446187568205E+03, -3.0888018539723868E+02}; - constexpr FLT c10[] = {-8.3747489794178762E+01, 1.1948077481430271E+02, 4.8528498043145930E+02, -2.5024391100070475E+03, 5.3511195380863319E+03, -6.7655484103934950E+03, 5.3511195323636521E+03, -2.5024391101798296E+03, 4.8528498086337265E+02, 1.1948077483184566E+02, -8.3747489794339316E+01}; - constexpr FLT c11[] = {-2.2640047135393669E+01, 9.0840898559070766E+01, -2.1597187557069051E+02, 3.1511228970473707E+02, -2.4856618213020064E+02, -2.0962600056762836E-06, 2.4856618232531096E+02, -3.1511228707801843E+02, 2.1597187541459934E+02, -9.0840898577362736E+01, 2.2640047135479467E+01}; - constexpr FLT c12[] = {-1.6306382885603201E+00, 7.3325946574893264E+00, -2.3241017691629008E+01, 5.1715493346619120E+01, -8.2673008978082819E+01, 9.6489716906321945E+01, -8.2673008978083388E+01, 5.1715493276466965E+01, -2.3241017744243891E+01, 7.3325946602297218E+00, -1.6306382886202573E+00}; - for (int i=0; i<11; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - constexpr FLT c0[] = {6.1722991679853279E+04, 8.4789650417103827E+07, 5.4431675199498749E+09, 7.8788892335272308E+10, 4.0355760945670074E+11, 8.8071481911347998E+11, 8.8071481911348035E+11, 4.0355760945670081E+11, 7.8788892335272507E+10, 5.4431675199498901E+09, 8.4789650417103752E+07, 6.1722991679871782E+04}; - constexpr FLT c1[] = {3.2561466099406185E+05, 2.2112758120210630E+08, 8.9911609880089836E+09, 8.3059508064200958E+10, 2.3965569143469873E+11, 1.6939286803305209E+11, -1.6939286803305209E+11, -2.3965569143469867E+11, -8.3059508064201111E+10, -8.9911609880090008E+09, -2.2112758120210621E+08, -3.2561466099404270E+05}; - constexpr FLT c2[] = {7.6621098001581512E+05, 2.6026568260310283E+08, 6.4524338253008652E+09, 3.3729904113826836E+10, 2.8555202212474079E+10, -6.8998572040731476E+10, -6.8998572040731461E+10, 2.8555202212474102E+10, 3.3729904113826820E+10, 6.4524338253008747E+09, 2.6026568260310283E+08, 7.6621098001583782E+05}; - constexpr FLT c3[] = {1.0657807616803222E+06, 1.8144472126891005E+08, 2.5524827004349880E+09, 5.2112383911371851E+09, -1.0268350564014641E+10, -1.4763245309081160E+10, 1.4763245309081381E+10, 1.0268350564014679E+10, -5.2112383911371050E+09, -2.5524827004349866E+09, -1.8144472126890993E+08, -1.0657807616803094E+06}; - constexpr FLT c4[] = {9.7829638830158766E+05, 8.2222351241520002E+07, 5.5676911894064677E+08, -4.8739037675425845E+08, -2.7153428193078089E+09, 2.5627633609246616E+09, 2.5627633609247270E+09, -2.7153428193078089E+09, -4.8739037675429344E+08, 5.5676911894064772E+08, 8.2222351241519988E+07, 9.7829638830161223E+05}; - constexpr FLT c5[] = {6.2536876825113979E+05, 2.4702814073680259E+07, 4.1488431554846764E+07, -2.9274790542417943E+08, 1.0742154109192364E+08, 6.2185168968026125E+08, -6.2185168968025279E+08, -1.0742154109186378E+08, 2.9274790542422217E+08, -4.1488431554844894E+07, -2.4702814073680248E+07, -6.2536876825112430E+05}; - constexpr FLT c6[] = {2.8527714307528501E+05, 4.6266378435690925E+06, -1.0665598090789001E+07, -2.6048960239884529E+07, 9.1597254427304730E+07, -5.9794495983325504E+07, -5.9794495983230442E+07, 9.1597254427350238E+07, -2.6048960239922173E+07, -1.0665598090794679E+07, 4.6266378435690831E+06, 2.8527714307530370E+05}; - constexpr FLT c7[] = {9.2873647411234633E+04, 3.6630046787437343E+05, -3.1271047224703613E+06, 4.8612412939389814E+06, 3.3820440907783178E+06, -1.6880127953644276E+07, 1.6880127953794900E+07, -3.3820440907782884E+06, -4.8612412938910574E+06, 3.1271047224760642E+06, -3.6630046787425788E+05, -9.2873647411217215E+04}; - constexpr FLT c8[] = {2.0817947751046311E+04, -5.5660303410283603E+04, -1.9519783923352187E+05, 1.0804817251249440E+06, -1.8264985852847320E+06, 9.7602844964054180E+05, 9.7602844964026869E+05, -1.8264985852578641E+06, 1.0804817251242315E+06, -1.9519783923298802E+05, -5.5660303410281354E+04, 2.0817947751063894E+04}; - constexpr FLT c9[] = {2.7986023314783351E+03, -1.9404411093657811E+04, 4.3922625001185028E+04, -7.6450317330166517E+03, -1.5273911976404343E+05, 3.3223441450907954E+05, -3.3223441450755787E+05, 1.5273911981578072E+05, 7.6450317512768770E+03, -4.3922624998712294E+04, 1.9404411093676386E+04, -2.7986023314643107E+03}; - constexpr FLT c10[] = {6.7849020474217255E+01, -1.7921351307610907E+03, 8.4980694701237535E+03, -1.9742624848712727E+04, 2.4620674811515193E+04, -1.1676544936917096E+04, -1.1676544845699163E+04, 2.4620674862652242E+04, -1.9742624819688928E+04, 8.4980694644226842E+03, -1.7921351307503089E+03, 6.7849020488654887E+01}; - constexpr FLT c11[] = {-5.4577020998540995E+01, 1.3637112871144197E+02, 4.5513617165591533E+01, -1.1174001347694452E+03, 3.2018768920645603E+03, -5.0580352089258022E+03, 5.0580351705274497E+03, -3.2018769484133886E+03, 1.1174001005075061E+03, -4.5513609907370189E+01, -1.3637112869192950E+02, 5.4577021011650153E+01}; - constexpr FLT c12[] = {-1.0538365872663764E+01, 4.6577222493036992E+01, -1.2606964247581806E+02, 2.1881090265912360E+02, -2.3273404104747246E+02, 1.0274271612440927E+02, 1.0274271612440242E+02, -2.3273400063947102E+02, 2.1881092482740195E+02, -1.2606964693052080E+02, 4.6577222495229805E+01, -1.0538365860486415E+01}; - constexpr FLT c13[] = {-4.6087004138254672E-01, 2.5969759057927089E+00, -9.6946928123584506E+00, 2.4990051638288470E+01, -4.6013914134428035E+01, 6.2056955095902744E+01, -6.2056967309552682E+01, 4.6013924603270830E+01, -2.4990037679831403E+01, 9.6946951024178141E+00, -2.5969758989770559E+00, 4.6087004739949022E-01}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - constexpr FLT c0[] = {9.8715725867495858E+04, 1.9828875496808127E+08, 1.7196758809615005E+10, 3.3083776881353601E+11, 2.2668873993375454E+12, 6.7734720591167598E+12, 9.6695220682534863E+12, 6.7734720591167490E+12, 2.2668873993375454E+12, 3.3083776881353540E+11, 1.7196758809615013E+10, 1.9828875496807912E+08, 9.8715725867495596E+04}; - constexpr FLT c1[] = {5.4491110456935561E+05, 5.4903670125539398E+08, 3.0879465445278194E+10, 3.9588436413399976E+11, 1.6860562536749780E+12, 2.4256447893117891E+12, 5.2271652473787576E-04, -2.4256447893117861E+12, -1.6860562536749771E+12, -3.9588436413399896E+11, -3.0879465445278202E+10, -5.4903670125538874E+08, -5.4491110456935479E+05}; - constexpr FLT c2[] = {1.3504711883426080E+06, 6.9286979077463174E+08, 2.4618123595484570E+10, 1.9493985627722617E+11, 3.9422703517046405E+11, -1.8678883613919846E+11, -8.5538079834550037E+11, -1.8678883613919666E+11, 3.9422703517046375E+11, 1.9493985627722595E+11, 2.4618123595484570E+10, 6.9286979077462602E+08, 1.3504711883426073E+06}; - constexpr FLT c3[] = {1.9937206140846505E+06, 5.2512029493766004E+08, 1.1253303793811764E+10, 4.6205527735932259E+10, -1.1607472377982828E+10, -1.6305241755642276E+11, 1.6137900538478137E-04, 1.6305241755642496E+11, 1.1607472377982767E+10, -4.6205527735932159E+10, -1.1253303793811754E+10, -5.2512029493765628E+08, -1.9937206140846501E+06}; - constexpr FLT c4[] = {1.9607419630386413E+06, 2.6425362558103913E+08, 3.1171259341747184E+09, 2.9839860297840395E+09, -1.9585031917561905E+10, -5.0666917387060509E+09, 3.6568794485482040E+10, -5.0666917387052479E+09, -1.9585031917561382E+10, 2.9839860297839293E+09, 3.1171259341747251E+09, 2.6425362558103746E+08, 1.9607419630386424E+06}; - constexpr FLT c5[] = {1.3593773865640303E+06, 9.1556445104158297E+07, 4.7074012944133645E+08, -1.1192579335656993E+09, -2.1090780087868536E+09, 5.2270306737954664E+09, 5.5914317801530834E-04, -5.2270306737946453E+09, 2.1090780087878797E+09, 1.1192579335657849E+09, -4.7074012944133860E+08, -9.1556445104157880E+07, -1.3593773865640303E+06}; - constexpr FLT c6[] = {6.8417206432039291E+05, 2.1561705510027312E+07, 7.5785249893027432E+06, -2.7456096030220407E+08, 3.4589095671070045E+08, 4.0256106808935356E+08, -1.0074306926604354E+09, 4.0256106809054130E+08, 3.4589095671009880E+08, -2.7456096030236250E+08, 7.5785249893008731E+06, 2.1561705510027334E+07, 6.8417206432039256E+05}; - constexpr FLT c7[] = {2.5248269397037590E+05, 3.0985559672617475E+06, -1.1816517087615140E+07, -8.2958498769974122E+06, 8.0546642347458601E+07, -1.0594657799513456E+08, 2.0249720264016184E-04, 1.0594657799514198E+08, -8.0546642347324282E+07, 8.2958498771580132E+06, 1.1816517087620620E+07, -3.0985559672620827E+06, -2.5248269397037590E+05}; - constexpr FLT c8[] = {6.7530100970876185E+04, 1.2373362326675311E+05, -2.1245597183288219E+06, 5.1047323238642653E+06, -1.4139444406972022E+06, -1.1818267556148527E+07, 2.0121548578311723E+07, -1.1818267556689126E+07, -1.4139444399964837E+06, 5.1047323237335468E+06, -2.1245597183262822E+06, 1.2373362326715943E+05, 6.7530100970876825E+04}; - constexpr FLT c9[] = {1.2421368748960511E+04, -5.0576243646858849E+04, -4.8878193436522284E+04, 6.5307896871419600E+05, -1.5497610128521242E+06, 1.5137725913425679E+06, 9.4288709689637382E-06, -1.5137725926086102E+06, 1.5497610130712469E+06, -6.5307896859246108E+05, 4.8878193441087336E+04, 5.0576243646517250E+04, -1.2421368748960882E+04}; - constexpr FLT c10[] = {1.2904654687548632E+03, -1.1169946054771519E+04, 3.3275109715936509E+04, -3.1765222282529230E+04, -5.9810982046625119E+04, 2.2355863065128919E+05, -3.1083591717381903E+05, 2.2355863453495159E+05, -5.9810982317515191E+04, -3.1765222420737289E+04, 3.3275109716627514E+04, -1.1169946054393644E+04, 1.2904654687550840E+03}; - constexpr FLT c11[] = {-1.9043622268214964E+01, -6.8296542209517031E+02, 4.2702512258593224E+03, -1.2165497344048174E+04, 1.9423733117203814E+04, -1.6010024763745962E+04, 3.4546242756821764E-04, 1.6010021562009399E+04, -1.9423732921465795E+04, 1.2165497485154361E+04, -4.2702512258593424E+03, 6.8296542155861471E+02, 1.9043622268233225E+01}; - constexpr FLT c12[] = {-3.0093984466084923E+01, 9.8972865759901183E+01, -9.7437038386122609E+01, -3.5079929976821143E+02, 1.5699249129925884E+03, -3.1287450613413444E+03, 3.8692192717886201E+03, -3.1287461388880197E+03, 1.5699252721748373E+03, -3.5079941874733129E+02, -9.7437038807041006E+01, 9.8972866294818274E+01, -3.0093984465708520E+01}; - constexpr FLT c13[] = {-4.3050286012574066E+00, 2.1108975856232256E+01, -6.4297196943170974E+01, 1.2922884719917388E+02, -1.6991815434264092E+02, 1.2654996803592717E+02, -1.3650372630766216E-04, -1.2655097304483594E+02, 1.6991801475807023E+02, -1.2922895886683040E+02, 6.4297199778482565E+01, -2.1108976173160116E+01, 4.3050286010444170E+00}; - constexpr FLT c14[] = {-1.0957333734356203E-01, 7.2949328697697935E-01, -3.4300803257592030E+00, 1.0470037850609911E+01, -2.2292132783546631E+01, 3.4570970759468082E+01, -3.9923502981338281E+01, 3.4573363471454584E+01, -2.2292171023236033E+01, 1.0470076090299283E+01, -3.4300793014818574E+00, 7.2949361239845723E-01, -1.0957333723937021E-01}; - for (int i=0; i<13; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - constexpr FLT c0[] = {1.5499533202966311E+05, 4.4723032442444772E+08, 5.1495083701694801E+10, 1.2904576022918081E+12, 1.1534950432785514E+13, 4.5650102198520523E+13, 8.8830582190032719E+13, 8.8830582190032734E+13, 4.5650102198520523E+13, 1.1534950432785541E+13, 1.2904576022918088E+12, 5.1495083701695160E+10, 4.4723032442444867E+08, 1.5499533202970124E+05}; - constexpr FLT c1[] = {8.9188339002980455E+05, 1.3065352538728638E+09, 9.9400185225815598E+10, 1.7136059013402412E+12, 1.0144146621675834E+13, 2.3034036018490723E+13, 1.4630967270448885E+13, -1.4630967270448867E+13, -2.3034036018490715E+13, -1.0144146621675846E+13, -1.7136059013402415E+12, -9.9400185225815979E+10, -1.3065352538728662E+09, -8.9188339002979419E+05}; - constexpr FLT c2[] = {2.3170473769379673E+06, 1.7532505043698251E+09, 8.6523535958354309E+10, 9.7455289065487476E+11, 3.2977972139362329E+12, 1.7874626001697834E+12, -6.1480918082633936E+12, -6.1480918082634014E+12, 1.7874626001697737E+12, 3.2977972139362251E+12, 9.7455289065487329E+11, 8.6523535958354599E+10, 1.7532505043698282E+09, 2.3170473769380408E+06}; - constexpr FLT c3[] = {3.6089249230396431E+06, 1.4278058213962200E+09, 4.4296625537022446E+10, 2.9466624630419830E+11, 3.1903621584503467E+11, -9.8834691411254578E+11, -1.1072264714919094E+12, 1.1072264714919380E+12, 9.8834691411255481E+11, -3.1903621584503326E+11, -2.9466624630419788E+11, -4.4296625537022636E+10, -1.4278058213962224E+09, -3.6089249230396668E+06}; - constexpr FLT c4[] = {3.7733555140851745E+06, 7.8376718099107444E+08, 1.4443117772349586E+10, 4.3197433307418678E+10, -7.6585042240583893E+10, -1.8569640140762125E+11, 2.0385335192658521E+11, 2.0385335192658505E+11, -1.8569640140762244E+11, -7.6585042240577591E+10, 4.3197433307418831E+10, 1.4443117772349697E+10, 7.8376718099107611E+08, 3.7733555140852574E+06}; - constexpr FLT c5[] = {2.8079157920112340E+06, 3.0340753492383713E+08, 2.9498136661747241E+09, -6.2820200387946582E+08, -2.2372008390623741E+10, 1.5217518660587065E+10, 4.0682590266890762E+10, -4.0682590266874344E+10, -1.5217518660581593E+10, 2.2372008390624836E+10, 6.2820200387926054E+08, -2.9498136661747794E+09, -3.0340753492383808E+08, -2.8079157920112382E+06}; - constexpr FLT c6[] = {1.5361613559533129E+06, 8.3513615594416931E+07, 3.0077547202709264E+08, -1.3749596754065564E+09, -6.6733027297578251E+08, 5.9590333632812872E+09, -4.3025685566868906E+09, -4.3025685566947279E+09, 5.9590333632843285E+09, -6.6733027297604084E+08, -1.3749596754066198E+09, 3.0077547202708143E+08, 8.3513615594416305E+07, 1.5361613559533581E+06}; - constexpr FLT c7[] = {6.2759409419593017E+05, 1.5741723594963871E+07, -1.5632610223386128E+07, -1.9294824907063219E+08, 4.4643806532504034E+08, 1.5178998384579189E+07, -9.6771139891231704E+08, 9.6771139892423606E+08, -1.5178998381071322E+07, -4.4643806533015347E+08, 1.9294824907069016E+08, 1.5632610223408137E+07, -1.5741723594963046E+07, -6.2759409419590794E+05}; - constexpr FLT c8[] = {1.9151404903933618E+05, 1.7156606891565623E+06, -9.7733523156695794E+06, 4.2982266232611798E+06, 5.1660907884888940E+07, -1.1279400211171694E+08, 6.4701089576848499E+07, 6.4701089570801638E+07, -1.1279400210612530E+08, 5.1660907893511616E+07, 4.2982266235306170E+06, -9.7733523156822342E+06, 1.7156606891565854E+06, 1.9151404903936735E+05}; - constexpr FLT c9[] = {4.2715272622844263E+04, -2.2565910611002505E+03, -1.1769776156928577E+06, 4.0078399906352242E+06, -3.8951858073074366E+06, -5.0944610789569877E+06, 1.6765992441849992E+07, -1.6765992434448514E+07, 5.0944610797360903E+06, 3.8951858063335577E+06, -4.0078399906595708E+06, 1.1769776157202481E+06, 2.2565910608803192E+03, -4.2715272622819932E+04}; - constexpr FLT c10[] = {6.4806786522801558E+03, -3.5474227032715331E+04, 1.8237100734263218E+04, 3.0934714642964909E+05, -1.0394703930801603E+06, 1.4743920316337310E+06, -7.3356881642929500E+05, -7.3356882324020052E+05, 1.4743920364765557E+06, -1.0394703915764539E+06, 3.0934714676135289E+05, 1.8237100683125096E+04, -3.5474227032952876E+04, 6.4806786523017845E+03}; - constexpr FLT c11[] = {4.9913632908494827E+02, -5.5416668522806276E+03, 2.0614058722611946E+04, -3.2285139157855901E+04, -5.3099566255893524E+03, 1.1559000150525174E+05, -2.2569743273246771E+05, 2.2569743457059452E+05, -1.1559000428242185E+05, 5.3099542679931265E+03, 3.2285138893125553E+04, -2.0614058670789782E+04, 5.5416668532562171E+03, -4.9913632906264002E+02}; - constexpr FLT c12[] = {-3.3076333188696488E+01, -1.8970588558436827E+02, 1.8160423493169353E+03, -6.3715703265863249E+03, 1.2525624646166696E+04, -1.4199807314837786E+04, 6.4441944019082612E+03, 6.4441857815347785E+03, -1.4199805590763088E+04, 1.2525627375951648E+04, -6.3715703355659844E+03, 1.8160422864600705E+03, -1.8970588672434647E+02, -3.3076333168693779E+01}; - constexpr FLT c13[] = {-1.4394533628062636E+01, 5.7000699174526638E+01, -1.0101142144442984E+02, -3.2954074617159108E+01, 6.1417869930814436E+02, -1.6177306801656998E+03, 2.4593354137960296E+03, -2.4593361954696252E+03, 1.6177288934831954E+03, -6.1417959264939657E+02, 3.2954074617159108E+01, 1.0101142929606195E+02, -5.7000698932570963E+01, 1.4394533639244566E+01}; - constexpr FLT c14[] = {-1.5925952284527973E+00, 8.5113930275160214E+00, -2.8993510636695618E+01, 6.6373557362227814E+01, -1.0329536491693236E+02, 1.0280181071020283E+02, -4.3891122033571499E+01, -4.3893656778687756E+01, 1.0280325289276884E+02, -1.0329444716438918E+02, 6.6373666618482872E+01, -2.8993528390837142E+01, 8.5113926647511526E+00, -1.5925952190335899E+00}; - constexpr FLT c15[] = {1.5984868634272537E-02, 1.2876168577716327E-01, -9.8358742969178536E-01, 3.7710928871122080E+00, -9.4315137784350505E+00, 1.6840408563519507E+01, -2.2308532530501328E+01, 2.2310146222863779E+01, -1.6843058416240989E+01, 9.4311230950209399E+00, -3.7712287769953385E+00, 9.8360653920659347E-01, -1.2876103884046056E-01, -1.5984859595043394E-02}; - for (int i=0; i<14; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - constexpr FLT c0[] = {2.3939707792242090E+05, 9.7700272582690299E+08, 1.4715933396485275E+11, 4.7242424833337236E+12, 5.3987426629953617E+13, 2.7580474290566103E+14, 7.0693378336533425E+14, 9.6196578554477850E+14, 7.0693378336533425E+14, 2.7580474290566153E+14, 5.3987426629953828E+13, 4.7242424833337285E+12, 1.4715933396485275E+11, 9.7700272582690418E+08, 2.3939707792242119E+05}; - constexpr FLT c1[] = {1.4314487885226035E+06, 2.9961416925358462E+09, 3.0273361232748425E+11, 6.8507333793903604E+12, 5.4192702756911016E+13, 1.7551587948105316E+14, 2.1874615668430153E+14, 5.4722295550654096E-02, -2.1874615668430156E+14, -1.7551587948105334E+14, -5.4192702756911172E+13, -6.8507333793903730E+12, -3.0273361232748438E+11, -2.9961416925358448E+09, -1.4314487885226023E+06}; - constexpr FLT c2[] = {3.8829497354762922E+06, 4.2473082696966453E+09, 2.8414312556015533E+11, 4.3688281331121431E+12, 2.1823119508000547E+13, 3.2228098609392133E+13, -2.1833085454691801E+13, -7.3750710225100750E+13, -2.1833085454691875E+13, 3.2228098609392070E+13, 2.1823119508000590E+13, 4.3688281331121470E+12, 2.8414312556015527E+11, 4.2473082696966438E+09, 3.8829497354762908E+06}; - constexpr FLT c3[] = {6.3495763451755792E+06, 3.6841035003733959E+09, 1.5965774278321054E+11, 1.5630338683778213E+12, 3.8749058615819409E+12, -2.7319740087722651E+12, -1.3233342822865350E+13, 1.2682483963161023E-01, 1.3233342822865453E+13, 2.7319740087724204E+12, -3.8749058615819307E+12, -1.5630338683778201E+12, -1.5965774278321042E+11, -3.6841035003733950E+09, -6.3495763451755783E+06}; - constexpr FLT c4[] = {7.0146619045520453E+06, 2.1782897863065763E+09, 5.8897780310148117E+10, 3.1953009601770477E+11, 4.0651527030195397E+08, -1.6379148273275671E+12, -1.1568753137013023E+11, 2.7451653250461045E+12, -1.1568753137006947E+11, -1.6379148273276748E+12, 4.0651527030228132E+08, 3.1953009601770502E+11, 5.8897780310148155E+10, 2.1782897863065772E+09, 7.0146619045520453E+06}; - constexpr FLT c5[] = {5.5580012413990172E+06, 9.2345162185944211E+08, 1.4522950934020031E+10, 2.7025952371212032E+10, -1.2304576967641461E+11, -1.0116752717201025E+11, 3.8517418245450385E+11, 1.3143739157465117E-02, -3.8517418245443384E+11, 1.0116752717219414E+11, 1.2304576967643431E+11, -2.7025952371216137E+10, -1.4522950934020092E+10, -9.2345162185944176E+08, -5.5580012413990181E+06}; - constexpr FLT c6[] = {3.2693972344231815E+06, 2.8610260147425276E+08, 2.2348528403751349E+09, -3.4574515574230409E+09, -1.7480626463581440E+10, 3.1608597465590984E+10, 1.9879262560063576E+10, -6.6148013553869423E+10, 1.9879262560078850E+10, 3.1608597465530212E+10, -1.7480626463573368E+10, -3.4574515574202504E+09, 2.2348528403750744E+09, 2.8610260147425228E+08, 3.2693972344231787E+06}; - constexpr FLT c7[] = {1.4553539959296281E+06, 6.4136842048384696E+07, 1.3622336582072574E+08, -1.2131510424637468E+09, 6.4322366984755766E+08, 4.5078753872548027E+09, -7.1689413747004452E+09, 3.2111361580040181E-03, 7.1689413747369127E+09, -4.5078753874649162E+09, -6.4322366984639454E+08, 1.2131510424612916E+09, -1.3622336582064471E+08, -6.4136842048384838E+07, -1.4553539959296265E+06}; - constexpr FLT c8[] = {4.9358776531681791E+05, 9.7772970960583091E+06, -2.3511574237971250E+07, -1.0142613816625430E+08, 3.9421144217985487E+08, -2.8449115594571364E+08, -5.7549243248595941E+08, 1.1608781630719392E+09, -5.7549243238966489E+08, -2.8449115596289498E+08, 3.9421144214631909E+08, -1.0142613816300942E+08, -2.3511574237913735E+07, 9.7772970960591603E+06, 4.9358776531681628E+05}; - constexpr FLT c9[] = {1.2660319987326709E+05, 7.7519511328105081E+05, -6.5244610661542164E+06, 9.0878257490973976E+06, 2.3116605621149909E+07, -8.7079594477661625E+07, 9.5542733670714021E+07, -3.4623017322338634E-02, -9.5542733658248380E+07, 8.7079594589852452E+07, -2.3116605559600774E+07, -9.0878257518242579E+06, 6.5244610661450867E+06, -7.7519511328086059E+05, -1.2660319987326671E+05}; - constexpr FLT c10[] = {2.3793325531461589E+04, -4.2305332802771904E+04, -5.2884156975031609E+05, 2.5307340145554747E+06, -4.0404175204335153E+06, -1.7519988538994591E+05, 1.0146438798034744E+07, -1.5828545528861172E+07, 1.0146438794496680E+07, -1.7520001842407117E+05, -4.0404175643064296E+06, 2.5307340160591919E+06, -5.2884156977243477E+05, -4.2305332802771285E+04, 2.3793325531458995E+04}; - constexpr FLT c11[] = {2.9741655196857741E+03, -2.0687056403629973E+04, 3.3295507834673197E+04, 1.0661145690364030E+05, -5.6644238449031080E+05, 1.0874811673184116E+06, -9.6561276275880623E+05, -7.6207036577648435E-02, 9.6561275636531680E+05, -1.0874812580259521E+06, 5.6644242612787138E+05, -1.0661145858193116E+05, -3.3295507822185595E+04, 2.0687056403005630E+04, -2.9741655196852739E+03}; - constexpr FLT c12[] = {1.5389176594840404E+02, -2.3864418517811582E+03, 1.0846266965476148E+04, -2.2940053899336592E+04, 1.4780105833703366E+04, 4.2663634529139046E+04, -1.3047650082135458E+05, 1.7468394417865420E+05, -1.3047642955960588E+05, 4.2663569014305380E+04, 1.4780038020101238E+04, -2.2940052498526344E+04, 1.0846266965476338E+04, -2.3864418513602504E+03, 1.5389176594853458E+02}; - constexpr FLT c13[] = {-2.3857631312306911E+01, -1.9651606200276817E+01, 6.4183084244784663E+02, -2.8648428291977302E+03, 6.8249248253356263E+03, -9.7944434082514545E+03, 7.6177566999585488E+03, -4.8285923071218206E-02, -7.6177709934185850E+03, 9.7944219680614005E+03, -6.8249060651693289E+03, 2.8648407633460843E+03, -6.4183085466149657E+02, 1.9651606115081155E+01, 2.3857631312306911E+01}; - constexpr FLT c14[] = {-6.1348505726741482E+00, 2.7872916302350376E+01, -6.5819898558168433E+01, 5.1367134246654771E+01, 1.7214275703496423E+02, -6.9657243183240860E+02, 1.3192259272931558E+03, -1.6054145588281010E+03, 1.3192138654025996E+03, -6.9662907027505264E+02, 1.7212038135392731E+02, 5.1368095701697484E+01, -6.5819904020980715E+01, 2.7872916473063263E+01, -6.1348505738411490E+00}; - constexpr FLT c15[] = {-4.9671584422774523E-01, 3.0617550953446120E+00, -1.1650665638577927E+01, 3.0081331929557447E+01, -5.4030564936801589E+01, 6.6075844179663960E+01, -4.7176211285519123E+01, -3.4313439732287163E-02, 4.7173085818207042E+01, -6.6061100127341888E+01, 5.4056655794367416E+01, -3.0081722612971500E+01, 1.1650665638577902E+01, -3.0617553939307713E+00, 4.9671584448693240E-01}; - constexpr FLT c16[] = {4.3460783761337983E-03, -1.3199934226522787E-02, -1.9412503880258877E-01, 1.1325756464362078E+00, -3.4439944517155450E+00, 7.1653575841078521E+00, -1.1108195405465501E+01, 1.2348789868125033E+01, -1.1088023137785596E+01, 7.0939141360622937E+00, -3.4847592426682690E+00, 1.1324705825441117E+00, -1.9413837699275374E-01, -1.3199908576142469E-02, 4.3460782759542488E-03}; - for (int i=0; i<15; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - constexpr FLT c0[] = {3.6434551345571154E+05, 2.0744705928579516E+09, 4.0355760945670056E+11, 1.6364575388763043E+13, 2.3514830376056566E+14, 1.5192201717462540E+15, 4.9956173084674150E+15, 8.9287666945127440E+15, 8.9287666945127440E+15, 4.9956173084674160E+15, 1.5192201717462542E+15, 2.3514830376056566E+14, 1.6364575388763049E+13, 4.0355760945670068E+11, 2.0744705928579512E+09, 3.6434551345570991E+05}; - constexpr FLT c1[] = {2.2576246485480345E+06, 6.6499571180086479E+09, 8.7873753526056311E+11, 2.5606844387131062E+13, 2.6313738449330162E+14, 1.1495095100701470E+15, 2.1932582707747572E+15, 1.2860244365132608E+15, -1.2860244365132600E+15, -2.1932582707747580E+15, -1.1495095100701462E+15, -2.6313738449330162E+14, -2.5606844387131066E+13, -8.7873753526056299E+11, -6.6499571180086479E+09, -2.2576246485480345E+06}; - constexpr FLT c2[] = {6.3730995546265058E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001667E+14, 3.0749346493041262E+14, 1.0259777520247212E+14, -5.5291976457534244E+14, -5.5291976457534294E+14, 1.0259777520247097E+14, 3.0749346493041212E+14, 1.2398425545001659E+14, 1.7953384130753672E+13, 8.8097248605448987E+11, 9.9060026035198078E+09, 6.3730995546265077E+06}; - constexpr FLT c3[] = {1.0896915393078227E+07, 9.0890343524593887E+09, 5.3565169504010052E+11, 7.3004206720038770E+12, 2.9692333044160145E+13, 1.6051737468109752E+13, -9.1273329108089609E+13, -8.5999306918501562E+13, 8.5999306918502812E+13, 9.1273329108090391E+13, -1.6051737468109348E+13, -2.9692333044160059E+13, -7.3004206720038691E+12, -5.3565169504010046E+11, -9.0890343524593925E+09, -1.0896915393078225E+07}; - constexpr FLT c4[] = {1.2655725616100591E+07, 5.7342804054544220E+09, 2.1822836608899585E+11, 1.8300700858999712E+12, 2.7770431049857900E+12, -8.5034969223848574E+12, -1.2846668467422469E+13, 1.6519076896573322E+13, 1.6519076896573414E+13, -1.2846668467422033E+13, -8.5034969223850078E+12, 2.7770431049858350E+12, 1.8300700858999753E+12, 2.1822836608899594E+11, 5.7342804054544239E+09, 1.2655725616100593E+07}; - constexpr FLT c5[] = {1.0609303958036318E+07, 2.6255609052371716E+09, 6.1673589426039268E+10, 2.6044432099085120E+11, -3.5431628074578119E+11, -1.6077602129631777E+12, 1.5534405614726155E+12, 2.8019935380863682E+12, -2.8019935380852476E+12, -1.5534405614728257E+12, 1.6077602129636682E+12, 3.5431628074579871E+11, -2.6044432099085229E+11, -6.1673589426039368E+10, -2.6255609052371745E+09, -1.0609303958036322E+07}; - constexpr FLT c6[] = {6.6544809363384582E+06, 8.9490403680928528E+08, 1.1882638725190987E+10, 8.1552898137820768E+09, -1.2575562817884897E+11, 2.7074695075942204E+10, 3.9453789461929230E+11, -3.1679644857371918E+11, -3.1679644857384814E+11, 3.9453789461920764E+11, 2.7074695075779831E+10, -1.2575562817882477E+11, 8.1552898137801113E+09, 1.1882638725190844E+10, 8.9490403680928373E+08, 6.6544809363384526E+06}; - constexpr FLT c7[] = {3.1906872142825029E+06, 2.2785946180651915E+08, 1.3744578972811413E+09, -4.3997172592843504E+09, -9.2011130753862667E+09, 3.4690551711764793E+10, -9.4227043392778511E+09, -5.9308465069355759E+10, 5.9308465069781982E+10, 9.4227043396369877E+09, -3.4690551711565643E+10, 9.2011130754329739E+09, 4.3997172592904301E+09, -1.3744578972811375E+09, -2.2785946180652067E+08, -3.1906872142825001E+06}; - constexpr FLT c8[] = {1.1821527096621764E+06, 4.2281234059839748E+07, 2.8723226058752719E+07, -8.3553955857505906E+08, 1.2447304828865275E+09, 2.1955280942222519E+09, -7.0514195727878428E+09, 4.3745141232918625E+09, 4.3745141237316084E+09, -7.0514195722924280E+09, 2.1955280943332024E+09, 1.2447304828901291E+09, -8.3553955857124400E+08, 2.8723226058927339E+07, 4.2281234059842363E+07, 1.1821527096621776E+06}; - constexpr FLT c9[] = {3.3854610744279926E+05, 5.2176984975088174E+06, -2.0677283565109752E+07, -3.5831818967739724E+07, 2.6599346107970935E+08, -3.7992777963644773E+08, -1.3426914477301279E+08, 9.1752051236703849E+08, -9.1752051203046608E+08, 1.3426914449876857E+08, 3.7992777988576066E+08, -2.6599346104854524E+08, 3.5831818969687484E+07, 2.0677283565073233E+07, -5.2176984975085324E+06, -3.3854610744279926E+05}; - constexpr FLT c10[] = {7.3893334077310792E+04, 2.6983804209766653E+05, -3.6415998560216571E+06, 8.4025485866871737E+06, 4.9278860835956605E+06, -5.1437033778820507E+07, 8.7603898248918146E+07, -4.6199497914231867E+07, -4.6199497948197275E+07, 8.7603898697554156E+07, -5.1437033767498761E+07, 4.9278861543586710E+06, 8.4025485891638417E+06, -3.6415998559774463E+06, 2.6983804209732520E+05, 7.3893334077308697E+04}; - constexpr FLT c11[] = {1.1778892113376965E+04, -4.0077190108567142E+04, -1.8372552169915423E+05, 1.3262878389569877E+06, -2.9738540196046322E+06, 1.9493506557541618E+06, 4.1881949490808225E+06, -1.1066749801915919E+07, 1.1066748877418302E+07, -4.1881948928182255E+06, -1.9493507634843190E+06, 2.9738539997848324E+06, -1.3262878392766670E+06, 1.8372552166918706E+05, 4.0077190106849979E+04, -1.1778892113376709E+04}; - constexpr FLT c12[] = {1.2019749667900676E+03, -1.0378455845063749E+04, 2.6333352662141660E+04, 1.7117059675298591E+04, -2.5133289742429825E+05, 6.4713895872015413E+05, -8.1634975674778735E+05, 3.8623909535608569E+05, 3.8623887467451266E+05, -8.1634966479713970E+05, 6.4713897711029404E+05, -2.5133289282677229E+05, 1.7117063267120848E+04, 2.6333352680101594E+04, -1.0378455843660833E+04, 1.2019749667921026E+03}; - constexpr FLT c13[] = {3.1189837631121321E+01, -8.9083493701244504E+02, 4.9454293991649774E+03, -1.3124692742151998E+04, 1.5834795298841136E+04, 6.9608292767098355E+03, -5.9790200829217545E+04, 1.0841735230501879E+05, -1.0841732371809872E+05, 5.9789914960016831E+04, -6.9607435159496199E+03, -1.5834797085523640E+04, 1.3124692295481371E+04, -4.9454294410403490E+03, 8.9083493766674769E+02, -3.1189837632399257E+01}; - constexpr FLT c14[] = {-1.2975319072478742E+01, 1.8283699094028595E+01, 1.7684019694555272E+02, -1.1059902320249000E+03, 3.1998244780238201E+03, -5.5987981589200417E+03, 5.9247600879368474E+03, -2.5988290685215188E+03, -2.5988178806809206E+03, 5.9249852432272892E+03, -5.5987701893187350E+03, 3.1998552445852642E+03, -1.1059895327848767E+03, 1.7684022972243278E+02, 1.8283699179384410E+01, -1.2975319072812146E+01}; - constexpr FLT c15[] = {-2.3155118729306223E+00, 1.1938503369059017E+01, -3.4150537494399323E+01, 4.8897188710734866E+01, 1.5839596560322873E+01, -2.4289147960969117E+02, 6.0143231605823757E+02, -8.8772403477020873E+02, 8.8712611928432557E+02, -6.0139861536721287E+02, 2.4281211991792659E+02, -1.5853729108169823E+01, -4.8898479664625256E+01, 3.4150529001281690E+01, -1.1938504563403686E+01, 2.3155118727038264E+00}; - constexpr FLT c16[] = {-1.5401723836370515E-01, 9.8067787978090881E-01, -4.1900810719931050E+00, 1.2149798852514468E+01, -2.4780790340446881E+01, 3.6014221907804398E+01, -3.4588714991383583E+01, 1.3071629460227753E+01, 1.2883354961750646E+01, -3.4615611348253751E+01, 3.5973877372428277E+01, -2.4777428295844171E+01, 1.2151059619254390E+01, -4.1901237542037384E+00, 9.8067813628521039E-01, -1.5401723766235165E-01}; - constexpr FLT c17[] = {1.1808834947531816E-02, -2.5444032491006262E-02, -1.4707353726716647E-04, 2.5840423001794482E-01, -1.0910598687678679E+00, 2.6514321899473572E+00, -4.5034457705829842E+00, 6.8479728528821520E+00, -6.8634402190500978E+00, 4.4285511554539836E+00, -2.6424773990080204E+00, 1.0878035811535636E+00, -2.5882398584322625E-01, 1.3196868749378181E-04, 2.5444131865017927E-02, -1.1808835384234016E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); - } else - printf("width not implemented!\n"); From 74ccd71834634179d26cd7224788c4994015062d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 31 Jul 2024 18:32:51 -0400 Subject: [PATCH 33/39] fixed cmake --- perftest/cuda/CMakeLists.txt | 9 +++++++-- src/cuda/CMakeLists.txt | 35 ++++++++++++++++++++++------------- 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/perftest/cuda/CMakeLists.txt b/perftest/cuda/CMakeLists.txt index ba3bde04a..ec95760fb 100644 --- a/perftest/cuda/CMakeLists.txt +++ b/perftest/cuda/CMakeLists.txt @@ -1,5 +1,10 @@ add_executable(cuperftest cuperftest.cu) target_include_directories(cuperftest PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(cuperftest PUBLIC cufinufft) -# file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/bench.sh DESTINATION -# ${CMAKE_CURRENT_BINARY_DIR}) +target_compile_features(cuperftest PRIVATE cxx_std_17) +set_target_properties( + cuperftest + PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 77b86ae77..2b91f91d7 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -1,7 +1,3 @@ -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CXX_EXTENSIONS OFF) -set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - set(PRECISION_INDEPENDENT_SRC precision_independent.cu utils.cpp ${PROJECT_SOURCE_DIR}/contrib/legendre_rule_fast.cpp) @@ -47,8 +43,14 @@ target_include_directories(cufinufft_common_objects set_target_properties( cufinufft_common_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) - + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_common_objects PRIVATE cxx_std_17) +target_compile_options( + cufinufft_common_objects + PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) target_compile_options( cufinufft_common_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) @@ -58,24 +60,31 @@ target_include_directories(cufinufft_objects PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) set_target_properties( cufinufft_objects PROPERTIES POSITION_INDEPENDENT_CODE ${FINUFFT_SHARED_LINKING} - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) +target_compile_features(cufinufft_objects PRIVATE cxx_std_17) target_compile_options( cufinufft_objects PRIVATE $<$:${FINUFFT_CUDA_FLAGS}>) if(FINUFFT_SHARED_LINKING) add_library(cufinufft SHARED $ $) - set_target_properties( - cufinufft PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) else() add_library(cufinufft STATIC $ $) - set_target_properties( - cufinufft PROPERTIES ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) endif() +set_target_properties( + cufinufft + PROPERTIES CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_SEPARABLE_COMPILATION ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) + if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft CUDA::nvToolsExt) From ee28d05c15be75fa6c377e17ecf51ef94e19f902 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 10:45:35 -0400 Subject: [PATCH 34/39] Gcc-9 fixes; Ker size fixed too --- devel/CMakeLists.txt | 24 +++++++++++++----------- examples/CMakeLists.txt | 3 +++ examples/cuda/CMakeLists.txt | 2 +- src/cuda/1d/spreadinterp1d.cuh | 6 +++--- src/cuda/2d/spreadinterp2d.cuh | 8 ++++---- src/cuda/CMakeLists.txt | 1 + test/cuda/CMakeLists.txt | 8 ++++++-- 7 files changed, 31 insertions(+), 21 deletions(-) diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt index 9a376408e..45b9a5989 100644 --- a/devel/CMakeLists.txt +++ b/devel/CMakeLists.txt @@ -2,23 +2,25 @@ project(finufft_devel) # Set the minimum required version of CMake cmake_minimum_required(VERSION 3.5) - # include cpm cmake, downloading it -CPMAddPackage( - NAME benchmark - GITHUB_REPOSITORY google/benchmark - VERSION 1.8.3 - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" - -) +cpmaddpackage( + NAME + benchmark + GITHUB_REPOSITORY + google/benchmark + VERSION + 1.8.3 + OPTIONS + "BENCHMARK_ENABLE_TESTING OFF") -if (benchmark_ADDED) - # patch benchmark target - set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) +if(benchmark_ADDED) + # patch benchmark target + set_target_properties(benchmark PROPERTIES CXX_STANDARD 17) endif() add_executable(foldrescale foldrescale.cpp) target_link_libraries(foldrescale finufft benchmark xsimd) add_executable(padding padding.cpp) +target_compile_features(padding PRIVATE cxx_std_17) target_link_libraries(padding finufft xsimd) target_compile_options(padding PRIVATE -march=native) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 35ac5662c..27b193cd5 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -11,6 +11,7 @@ set(EXAMPLES_C guru1d1c simple1d1c simple1d1cf) foreach(EXAMPLE ${EXAMPLES}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) target_link_libraries(${EXAMPLE} PRIVATE finufft) enable_asan(${EXAMPLE}) endforeach() @@ -18,6 +19,7 @@ endforeach() foreach(EXAMPLE ${EXAMPLES_C}) add_executable(${EXAMPLE} ${EXAMPLE}.c) target_link_libraries(${EXAMPLE} PRIVATE finufft) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) endforeach() @@ -25,6 +27,7 @@ if(FINUFFT_USE_OPENMP) foreach(EXAMPLE ${EXAMPLES_OPENMP}) add_executable(${EXAMPLE} ${EXAMPLE}.cpp) target_link_libraries(${EXAMPLE} PRIVATE finufft OpenMP::OpenMP_CXX) + target_compile_features(${EXAMPLE} PRIVATE cxx_std_17) enable_asan(${EXAMPLE}) endforeach() endif() diff --git a/examples/cuda/CMakeLists.txt b/examples/cuda/CMakeLists.txt index 0c9dba361..b9742a865 100644 --- a/examples/cuda/CMakeLists.txt +++ b/examples/cuda/CMakeLists.txt @@ -1,4 +1,3 @@ - file(GLOB example_src "*.cpp") foreach(srcfile ${example_src}) @@ -7,4 +6,5 @@ foreach(srcfile ${example_src}) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} cufinufft) + target_compile_features(${executable} PRIVATE cxx_std_17) endforeach() diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index 56493ef73..72c776c06 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -24,7 +24,7 @@ __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation to reduce stack usage #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; @@ -109,7 +109,7 @@ __global__ void spread_1d_subprob( // dynamic stack allocation #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; @@ -160,7 +160,7 @@ __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, T es_c, T es_beta, T sigma, const int *idxnupts) { // dynamic stack allocation #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns); auto *__restrict__ ker1 = ker; #else T ker1[MAX_NSPREAD]; diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index 03da3ed8a..53a243e7e 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -20,7 +20,7 @@ __global__ void spread_2d_nupts_driven( const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -137,7 +137,7 @@ __global__ void spread_2d_subprob( const int N = (bin_size_x + rounded_ns) * (bin_size_y + rounded_ns); #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -211,7 +211,7 @@ __global__ void interp_2d_nupts_driven( const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts) { #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else @@ -265,7 +265,7 @@ __global__ void interp_2d_subprob( cuda_complex *fwshared = (cuda_complex *)sharedbuf; #if ALLOCA_SUPPORTED - auto ker = (T *)alloca(sizeof(T) * ns * 3); + auto ker = (T *)alloca(sizeof(T) * ns * 2); auto *__restrict__ ker1 = ker; auto *__restrict__ ker2 = ker + ns; #else diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index 2b91f91d7..ae9431c31 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -84,6 +84,7 @@ set_target_properties( CUDA_STANDARD_REQUIRED ON ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}" CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) +target_compile_features(cufinufft PRIVATE cxx_std_17) if(WIN32) target_link_libraries(cufinufft PUBLIC CUDA::cudart CUDA::cufft diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index a74dcdd79..6d93d3f15 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,8 +7,12 @@ foreach(srcfile ${test_src}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) set_target_properties( - ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES - ${FINUFFT_CUDA_ARCHITECTURES}) + ${executable} + PROPERTIES LINKER_LANGUAGE CUDA + CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON) + target_compile_features(${executable} PRIVATE cxx_std_17) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") From 466ddffe166a505fd37972f39f6555c8d580ffa0 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 16:38:43 -0400 Subject: [PATCH 35/39] windows compatibility tweak; unit testing the 1.25 upsampfact --- include/cufinufft/utils.h | 13 ++++-- src/cuda/spreadinterp.cpp | 2 +- test/cuda/CMakeLists.txt | 75 ++++++++++++++++--------------- test/cuda/cufinufft1d_test.cu | 30 +++++++------ test/cuda/cufinufft2d_test.cu | 37 ++++++++------- test/cuda/cufinufft2dmany_test.cu | 13 +++--- test/cuda/cufinufft3d_test.cu | 38 ++++++++-------- 7 files changed, 113 insertions(+), 95 deletions(-) diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h index b4db528ae..4bfaa801d 100644 --- a/include/cufinufft/utils.h +++ b/include/cufinufft/utils.h @@ -92,7 +92,12 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { #if defined(__CUDACC_VER_MAJOR__) && defined(__CUDACC_VER_MINOR__) #if (__CUDACC_VER_MAJOR__ > 11) || \ (__CUDACC_VER_MAJOR__ == 11 && __CUDACC_VER_MINOR__ >= 3 && __CUDA_ARCH__ >= 600) + #define ALLOCA_SUPPORTED 1 +// windows compatibility +#if __has_include() +#include +#endif #else #define ALLOCA_SUPPORTED 0 #endif @@ -118,8 +123,8 @@ __forceinline__ __device__ auto interval(const int ns, const double x) { */ template -static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexShared( + cuda_complex *address, cuda_complex res) { const auto raw_address = reinterpret_cast(address); atomicAdd(raw_address, res.x); atomicAdd(raw_address + 1, res.y); @@ -131,8 +136,8 @@ static __forceinline__ __device__ void atomicAddComplexShared(cuda_complex *a * on shared memory are supported so we leverage them */ template -static __forceinline__ __device__ void atomicAddComplexGlobal(cuda_complex *address, - cuda_complex res) { +static __forceinline__ __device__ void atomicAddComplexGlobal( + cuda_complex *address, cuda_complex res) { if constexpr ( std::is_same_v, float2> && COMPUTE_CAPABILITY_90_OR_HIGHER) { atomicAdd(address, res); diff --git a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index b01d1c98f..98b5382bc 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -22,7 +22,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet // Must call before any kernel evals done. // Returns: 0 success, 1, warning, >1 failure (see error codes in utils.h) { - if (upsampfac != 2.0) { // nonstandard sigma + if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 6d93d3f15..04ae83e75 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -7,76 +7,77 @@ foreach(srcfile ${test_src}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) target_link_libraries(${executable} PUBLIC cufinufft m) set_target_properties( - ${executable} - PROPERTIES LINKER_LANGUAGE CUDA - CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES} - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON) - target_compile_features(${executable} PRIVATE cxx_std_17) + ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES + ${FINUFFT_CUDA_ARCHITECTURES}) message(STATUS "Adding test ${executable}" " with CUDA_ARCHITECTURES=${FINUFFT_CUDA_ARCHITECTURES}" " and INCLUDE=${CUFINUFFT_INCLUDE_DIRS}") endforeach() -function(add_tests PREC REQ_TOL CHECK_TOL) - add_test(NAME cufinufft1d1_test_GM_${PREC} - COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) +function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP) + add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test(NAME cufinufft1d1_test_SM_${PREC} - COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d1_test_SM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test(NAME cufinufft1d2_test_GM_${PREC} - COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}) + add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP} + COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC} + ${UPSAMP}) - add_test(NAME cufinufft2d1_test_GM_${PREC} + add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1_test_SM_${PREC} + add_test(NAME cufinufft2d1_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1many_test_GM_${PREC} + add_test(NAME cufinufft2d1many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 1 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d1many_test_SM_${PREC} + add_test(NAME cufinufft2d1many_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 2 1 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d2many_test_GM_${PREC} + add_test(NAME cufinufft2d2many_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 1 2 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft2d2many_test_SM_${PREC} + add_test(NAME cufinufft2d2many_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft2dmany_test 2 2 1e2 2e2 5 0 2e4 ${REQ_TOL} - ${CHECK_TOL} ${PREC}) + ${CHECK_TOL} ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d1_test_GM_${PREC} + add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) if(${PREC} STREQUAL "float") - add_test(NAME cufinufft3d1_test_SM_${PREC} + add_test(NAME cufinufft3d1_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d1_test_block_${PREC} + add_test(NAME cufinufft3d1_test_block_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 4 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) - add_test(NAME cufinufft3d2_test_SM_${PREC} + add_test(NAME cufinufft3d2_test_SM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 2 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) endif() - add_test(NAME cufinufft3d2_test_GM_${PREC} + add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP} COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL} - ${PREC}) + ${PREC} ${UPSAMP}) endfunction() -add_tests(float 1e-5 2e-4) -add_tests(double 1e-12 1e-11) +add_tests(float 1e-5 2e-4 2.0) +add_tests(double 1e-12 1e-11 2.0) +add_tests(float 1e-5 2e-4 1.25) +add_tests(double 1e-8 1e-7 1.25) add_test(NAME cufinufft_public_api COMMAND public_api_test) add_test(NAME cufinufft_makeplan COMMAND test_makeplan) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index 05b62025e..dbd6260ac 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -17,7 +17,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -88,6 +89,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) opts.gpu_method = method; opts.gpu_maxbatchsize = 1; + opts.upsampfac = upsampfac; int nmodes[3] = {N1, 1, 1}; int ntransf = 1; @@ -178,7 +180,7 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) } int main(int argc, char *argv[]) { - if (argc != 8) { + if (argc != 9) { fprintf(stderr, "Usage: cufinufft1d_test method type N1 M tol checktol prec\n" "Arguments:\n" " method: One of\n" @@ -188,21 +190,23 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " precision: f or d\n"); + " precision: f or d\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int M = atof(argv[4]); - const double tol = atof(argv[5]); - const double checktol = atof(argv[6]); - const int iflag = 1; - const char prec = argv[7][0]; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int M = atof(argv[4]); + const double tol = atof(argv[5]); + const double checktol = atof(argv[6]); + const int iflag = 1; + const char prec = argv[7][0]; + const double upsampfac = atof(argv[8]); if (prec == 'f') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, M, tol, checktol, iflag); + return run_test(method, type, N1, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 4157f6230..f3b767f2e 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -18,7 +18,8 @@ using cufinufft::utils::infnorm; template -int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag) { +int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int iflag, + double upsampfac) { std::cout << std::scientific << std::setprecision(3); thrust::host_vector x(M), y(M); @@ -88,9 +89,9 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int opts.gpu_method = method; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, 1}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, 1}; + int ntransf = 1; cudaEventRecord(start); int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -178,7 +179,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int } int main(int argc, char *argv[]) { - if (argc != 9) { + if (argc != 10) { fprintf(stderr, "Usage: cufinufft2d1_test method N1 N2 M tol checktol\n" "Arguments:\n" " method: One of\n" @@ -189,23 +190,25 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int M = atof(argv[5]); - const double tol = atof(argv[6]); - const double checktol = atof(argv[7]); - const char prec = argv[8][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int M = atof(argv[5]); + const double tol = atof(argv[6]); + const double checktol = atof(argv[7]); + const char prec = argv[8][0]; + const double upsampfac = atof(argv[9]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, M, tol, checktol, iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index b4f3529e1..4afcd97dd 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize, int M, - T tol, T checktol, int iflag) { + T tol, T checktol, int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -93,6 +93,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize opts.gpu_method = method; opts.gpu_maxbatchsize = maxbatchsize; + opts.upsampfac = upsampfac; int nmodes[3] = {N1, N2, 1}; cudaEventRecord(start); @@ -184,7 +185,7 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize } int main(int argc, char *argv[]) { - if (argc != 11) { + if (argc != 12) { fprintf(stderr, "Usage: cufinufft2d1many_test method type N1 N2 ntransf maxbatchsize M " "tol checktol prec\n" @@ -199,7 +200,8 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsampfac: upsampling factor\n"); return 1; } const int method = atoi(argv[1]); @@ -212,14 +214,15 @@ int main(int argc, char *argv[]) { const double tol = atof(argv[8]); const double checktol = atof(argv[9]); const char prec = argv[10][0]; + const double upsampfac = atof(argv[11]); const int iflag = 1; if (prec == 'f') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else if (prec == 'd') return run_test(method, type, N1, N2, ntransf, maxbatchsize, M, tol, checktol, - iflag); + iflag, upsampfac); else return -1; } diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index 933dda36d..67818c2b2 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -19,7 +19,7 @@ using cufinufft::utils::infnorm; template int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T checktol, - int iflag) { + int iflag, double upsampfac) { std::cout << std::scientific << std::setprecision(3); int ier; @@ -94,9 +94,9 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check opts.gpu_method = method; opts.gpu_kerevalmeth = 1; opts.gpu_maxbatchsize = 1; - - int nmodes[3] = {N1, N2, N3}; - int ntransf = 1; + opts.upsampfac = upsampfac; + int nmodes[3] = {N1, N2, N3}; + int ntransf = 1; cudaEventRecord(start); ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); @@ -190,7 +190,7 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check } int main(int argc, char *argv[]) { - if (argc < 10) { + if (argc != 11) { fprintf(stderr, "Usage: cufinufft3d1_test method type N1 N2 N3 M tol checktol prec\n" "Arguments:\n" @@ -203,24 +203,26 @@ int main(int argc, char *argv[]) { " M: The number of non-uniform points\n" " tol: NUFFT tolerance\n" " checktol: relative error to pass test\n" - " prec: 'f' or 'd' (float/double)\n"); + " prec: 'f' or 'd' (float/double)\n" + " upsamplefac: upsampling factor\n"); return 1; } - const int method = atoi(argv[1]); - const int type = atoi(argv[2]); - const int N1 = atof(argv[3]); - const int N2 = atof(argv[4]); - const int N3 = atof(argv[5]); - const int M = atof(argv[6]); - const double tol = atof(argv[7]); - const double checktol = atof(argv[8]); - const char prec = argv[9][0]; - const int iflag = 1; + const int method = atoi(argv[1]); + const int type = atoi(argv[2]); + const int N1 = atof(argv[3]); + const int N2 = atof(argv[4]); + const int N3 = atof(argv[5]); + const int M = atof(argv[6]); + const double tol = atof(argv[7]); + const double checktol = atof(argv[8]); + const char prec = argv[9][0]; + const double upsampfac = atof(argv[10]); + const int iflag = 1; if (prec == 'f') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else if (prec == 'd') - return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag); + return run_test(method, type, N1, N2, N3, M, tol, checktol, iflag, upsampfac); else return -1; } From fb48ff8d668905bed97eb917e8e18ba4ffd74e4e Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 1 Aug 2024 16:47:12 -0400 Subject: [PATCH 36/39] added forgotten c++17 flag --- test/cuda/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 04ae83e75..d9c5d312b 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -5,7 +5,11 @@ foreach(srcfile ${test_src}) get_filename_component(executable ${executable} NAME) add_executable(${executable} ${srcfile}) target_include_directories(${executable} PUBLIC ${CUFINUFFT_INCLUDE_DIRS}) - target_link_libraries(${executable} PUBLIC cufinufft m) + find_library(MathLib m) + if(MathLib) + target_link_libraries(${executable} PUBLIC cufinufft ${MathLib}) + endif() + target_compile_features(${executable} PUBLIC cxx_std_17) set_target_properties( ${executable} PROPERTIES LINKER_LANGUAGE CUDA CUDA_ARCHITECTURES ${FINUFFT_CUDA_ARCHITECTURES}) From afabb3f979c6a485382d6d1aa5e80a32604021f2 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 2 Aug 2024 17:48:00 -0400 Subject: [PATCH 37/39] Addressing review comments --- include/cufinufft/common.h | 21 ++++++++++---------- include/cufinufft/impl.h | 13 ++++++------ src/cuda/1d/spread1d_wrapper.cu | 10 ++++++++-- src/cuda/2d/spread2d_wrapper.cu | 10 ++++++++-- src/cuda/3d/spread3d_wrapper.cu | 11 ++++++++--- src/cuda/common.cu | 33 +++++++++++-------------------- src/cuda/cufinufft.cu | 26 ++++++++++++------------ src/cuda/precision_independent.cu | 7 ------- 8 files changed, 65 insertions(+), 66 deletions(-) diff --git a/include/cufinufft/common.h b/include/cufinufft/common.h index 33d8a0d86..efa7eb7b1 100644 --- a/include/cufinufft/common.h +++ b/include/cufinufft/common.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -42,26 +43,26 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts); template auto cufinufft_set_shared_memory(V *kernel, const int dim, const cufinufft_plan_t &d_plan) { - int device_id; + /** + * WARNING: this function does not handle cuda errors. The caller should check them. + */ + int device_id{}, shared_mem_per_block{}; cudaGetDevice(&device_id); const auto shared_mem_required = shared_memory_required(dim, d_plan.spopts.nspread, d_plan.opts.gpu_binsizex, d_plan.opts.gpu_binsizey, d_plan.opts.gpu_binsizez); - int shared_mem_per_block{}; - const auto err = cudaDeviceGetAttribute( - &shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - if (err != cudaSuccess) { - return err; - } + cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, + device_id); if (shared_mem_required > shared_mem_per_block) { fprintf(stderr, "Error: Shared memory required per block is %zu bytes, but the device " "supports only %d bytes.\n", shared_mem_required, shared_mem_per_block); - return err; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } - return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, - shared_mem_required); + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + shared_mem_required); + return 0; } } // namespace common diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h index c3021a7ff..3a9fd6877 100644 --- a/include/cufinufft/impl.h +++ b/include/cufinufft/impl.h @@ -121,6 +121,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran d_plan->mu = nmodes[2]; cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts); + RETURN_IF_CUDA_ERROR + CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1; set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1, d_plan->opts.gpu_obinsizex); @@ -136,14 +138,11 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran /* Automatically set GPU method. */ if (d_plan->opts.gpu_method == 0) { - /* For type 1, we default to method 2 (SM) since this is generally faster. - * However, in the special case of _double precision_ in _three dimensions_ - * with more than _three digits of precision_, there is note enough shared - * memory for this to work. As a result, we will default to method 1 (GM) in - * this special case. + /* For type 1, we default to method 2 (SM) since this is generally faster + * if there is enough shared memory available. Otherwise, we default to GM. * - * For type 2, we always default to method 1 (GM). */ - + * For type 2, we always default to method 1 (GM). + */ if (type == 2) { d_plan->opts.gpu_method = 1; } else { diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index 824da42c9..1b2afde7d 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -268,7 +268,10 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, @@ -278,7 +281,10 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) } } else { for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_1d_subprob, 1, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index 244d25b03..80cf9f8e9 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -277,7 +277,10 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, @@ -288,7 +291,10 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, } } else { for (int t = 0; t < blksize; t++) { - cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_2d_subprob, 2, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 4fb2b073d..475a888ac 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -280,7 +280,6 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, blocks.x = (threadsPerBlock.x + numbins[0] - 1) / threadsPerBlock.x; blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; - blocks.y = (threadsPerBlock.y + numbins[1] - 1) / threadsPerBlock.y; blocks.z = (threadsPerBlock.z + numbins[2] - 1) / threadsPerBlock.z; ghost_bin_pts_index<<>>( @@ -538,7 +537,10 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez); for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { - cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, @@ -547,7 +549,10 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ numbins[0], numbins[1], numbins[2], d_idxnupts); RETURN_IF_CUDA_ERROR } else { - cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan); + if (const auto finufft_err = + cufinufft_set_shared_memory(spread_3d_subprob, 3, *d_plan) != 0) { + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } RETURN_IF_CUDA_ERROR spread_3d_subprob<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 6e7064b25..c7a47eeae 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -249,16 +249,11 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { int shared_mem_per_block{}, device_id{}; switch (dim) { case 1: { - if (opts->gpu_binsizex < 0) { + if (opts->gpu_binsizex == 0) { cudaGetDevice(&device_id); - if (const auto err = cudaGetLastError(); err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - if (const auto err = cudaGetLastError(); err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } + // CUDA error handled by the caller not checking them here. // use 1/6 of the shared memory for the binsize // From experiments on multiple GPUs this gives the best tradeoff. // It is within 90% of the maximum performance for all GPUs tested. @@ -271,7 +266,7 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { opts->gpu_binsizez = 1; } break; case 2: { - if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0) { + if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0) { switch (opts->gpu_method) { case 0: case 2: { @@ -280,16 +275,10 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { } break; case 1: { cudaGetDevice(&device_id); - if (const auto err = cudaGetLastError(); err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } cudaDeviceGetAttribute(&shared_mem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id); - if (const auto err = cudaGetLastError(); err != cudaSuccess) { - throw std::runtime_error(cudaGetErrorString(err)); - } - const auto binsize = find_bin_size(shared_mem_per_block, dim, ns); + // in 2D 1/6 is too small, it gets slower because of the excessive padding opts->gpu_binsizex = binsize; opts->gpu_binsizey = binsize; } break; @@ -302,19 +291,19 @@ void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { case 0: case 1: case 2: { - if (opts->gpu_binsizex < 0 || opts->gpu_binsizey < 0 || opts->gpu_binsizez < 0) { + if (opts->gpu_binsizex == 0 || opts->gpu_binsizey == 0 || opts->gpu_binsizez == 0) { opts->gpu_binsizex = 16; opts->gpu_binsizey = 16; opts->gpu_binsizez = 2; } } break; case 4: { - opts->gpu_obinsizex = (opts->gpu_obinsizex < 0) ? 8 : opts->gpu_obinsizex; - opts->gpu_obinsizey = (opts->gpu_obinsizey < 0) ? 8 : opts->gpu_obinsizey; - opts->gpu_obinsizez = (opts->gpu_obinsizez < 0) ? 8 : opts->gpu_obinsizez; - opts->gpu_binsizex = (opts->gpu_binsizex < 0) ? 4 : opts->gpu_binsizex; - opts->gpu_binsizey = (opts->gpu_binsizey < 0) ? 4 : opts->gpu_binsizey; - opts->gpu_binsizez = (opts->gpu_binsizez < 0) ? 4 : opts->gpu_binsizez; + opts->gpu_obinsizex = (opts->gpu_obinsizex == 0) ? 8 : opts->gpu_obinsizex; + opts->gpu_obinsizey = (opts->gpu_obinsizey == 0) ? 8 : opts->gpu_obinsizey; + opts->gpu_obinsizez = (opts->gpu_obinsizez == 0) ? 8 : opts->gpu_obinsizez; + opts->gpu_binsizex = (opts->gpu_binsizex == 0) ? 4 : opts->gpu_binsizex; + opts->gpu_binsizey = (opts->gpu_binsizey == 0) ? 4 : opts->gpu_binsizey; + opts->gpu_binsizez = (opts->gpu_binsizez == 0) ? 4 : opts->gpu_binsizez; } break; } } break; diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index c0066d049..c00bf8eba 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -102,26 +102,26 @@ void cufinufft_default_opts(cufinufft_opts *opts) { // sphinx tag (don't remove): @gpu_defopts_start // data handling opts... - opts->modeord = 0; + opts->modeord = 0; opts->gpu_device_id = 0; // diagnostic opts... opts->gpu_spreadinterponly = 0; // algorithm performance opts... - opts->gpu_method = 0; - opts->gpu_sort = 1; - opts->gpu_kerevalmeth = 1; - opts->upsampfac = 2.0; + opts->gpu_method = 0; + opts->gpu_sort = 1; + opts->gpu_kerevalmeth = 1; + opts->upsampfac = 2.0; opts->gpu_maxsubprobsize = 1024; - opts->gpu_obinsizex = -1; - opts->gpu_obinsizey = -1; - opts->gpu_obinsizez = -1; - opts->gpu_binsizex = -1; - opts->gpu_binsizey = -1; - opts->gpu_binsizez = -1; - opts->gpu_maxbatchsize = 0; - opts->gpu_stream = cudaStreamDefault; + opts->gpu_obinsizex = 0; + opts->gpu_obinsizey = 0; + opts->gpu_obinsizez = 0; + opts->gpu_binsizex = 0; + opts->gpu_binsizey = 0; + opts->gpu_binsizez = 0; + opts->gpu_maxbatchsize = 0; + opts->gpu_stream = cudaStreamDefault; // sphinx tag (don't remove): @gpu_defopts_end } } diff --git a/src/cuda/precision_independent.cu b/src/cuda/precision_independent.cu index 66cc5ca69..b2c0c292f 100644 --- a/src/cuda/precision_independent.cu +++ b/src/cuda/precision_independent.cu @@ -52,13 +52,6 @@ __global__ void map_b_into_subprob_1d(int *d_subprob_to_bin, int *d_subprobstart } } -__global__ void trivial_global_sort_index_1d(int M, int *index) { - for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < M; - i += gridDim.x * blockDim.x) { - index[i] = i; - } -} - /* spreadinterp 2d */ __global__ void calc_subprob_2d(int *bin_size, int *num_subprob, int maxsubprobsize, int numbins) { From c3df5e118f6736e757c0377b94358c0a51d44008 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 2 Aug 2024 17:50:19 -0400 Subject: [PATCH 38/39] Added warning --- src/cuda/common.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cuda/common.cu b/src/cuda/common.cu index c7a47eeae..b19986520 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -245,6 +245,8 @@ template void cufinufft_setup_binsize(int type, int ns, int dim, cufinufft_opts *opts) { // Marco Barbone 07/26/24. Using the shared memory available on the device, to // determine the optimal binsize for the spreader. + // WARNING: This function does not check for CUDA errors, the caller should check and + // handle them. // TODO: This can still be improved some sizes are hardcoded still int shared_mem_per_block{}, device_id{}; switch (dim) { From 44c523b5dc8c17a98043fd98530ac4193e25490d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 2 Aug 2024 18:18:33 -0400 Subject: [PATCH 39/39] updated changelog --- CHANGELOG | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG b/CHANGELOG index f8fb276f4..cf842bb67 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -64,6 +64,7 @@ V 2.3.0beta (7/24/24) * cuFINUFFT using intrinsics in foldrescale and other places to increase performance * cuFINUFFT using SM90 float2 vector atomicAdd where supported +* cuFINUFFT making default binsize = 0 V 2.2.0 (12/12/23)