From e6d6ac08ceab4a8d2776834d9320b061cdffbf94 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 5 Jun 2024 15:23:28 -0400 Subject: [PATCH 01/35] added git blame ignore revs --- .git-blame-ignore-revs | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .git-blame-ignore-revs diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs new file mode 100644 index 000000000..1e469ec95 --- /dev/null +++ b/.git-blame-ignore-revs @@ -0,0 +1,2 @@ +# Applied clang format to the codebase +884ba427be0c60aa3399d5ea71b0e9e3a7cbf686 \ No newline at end of file From fd683c5dbee6f5238b04fa26a7a0f485bf6889ac Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 6 Jun 2024 18:16:08 -0400 Subject: [PATCH 02/35] spreader-vectorized --- CMakeLists.txt | 47 +- cmake/setupXSIMD.cmake | 19 + devel/CMakeLists.txt | 6 +- devel/padding.cpp | 278 +++++ include/finufft/defs.h | 13 +- src/finufft.cpp | 166 ++- src/ker_horner_allw_loop_constexpr.h | 913 ++++++++++++++ ..._lowupsampfac_horner_allw_loop_constexpr.c | 192 +++ src/simpleinterfaces.cpp | 8 +- src/spreadinterp.cpp | 1055 ++++++++++++----- 10 files changed, 2278 insertions(+), 419 deletions(-) create mode 100644 cmake/setupXSIMD.cmake create mode 100644 devel/padding.cpp create mode 100644 src/ker_horner_allw_loop_constexpr.h create mode 100644 src/ker_lowupsampfac_horner_allw_loop_constexpr.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 347d2f3f8..1626ad35e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,20 +2,24 @@ cmake_minimum_required(VERSION 3.19) project(finufft VERSION 2.2.0 LANGUAGES C CXX) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers - set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG -funroll-loops") - set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG -funroll-loops") + set(FINUFFT_CXX_FLAGS_RELEASE -funroll-loops -ffp-contract=fast) + set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -O3 -g -DNDEBUG ${FINUFFT_CXX_FLAGS_RELEASE}) endif () include(CTest) -if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) - # PowerPC arch does not have -march flag. - set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.") -else () - set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.") +if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) + if (CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64")) + # PowerPC arch does not have -march flag. + set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.") + else () + set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.") + endif () endif () set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)") set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library") @@ -31,7 +35,7 @@ option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disable option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) option(FINUFFT_STATIC_LINKING "Whether to link the static FINUFFT library (libfinufft_static)." ON) -option(FINUFFT_BUILD_DEVEL "Whether to build developement executables" OFF) +option(FINUFFT_BUILD_DEVEL "Whether to build development executables" OFF) # sphinx tag (don't remove): @cmake_opts_end if (FINUFFT_USE_CPU) @@ -47,10 +51,11 @@ if (FINUFFT_USE_CPU) set(CPM_DOWNLOAD_VERSION 0.38.0) set(FFTW_VERSION 3.3.10) - + set(XTL_VERSION 0.7.7) + set(XSIMD_VERSION 13.0.0) include(cmake/setupCPM.cmake) include(cmake/setupFFTW.cmake) - + include(cmake/setupXSIMD.cmake) endif () if (FINUFFT_BUILD_MATLAB) @@ -92,8 +97,8 @@ function(enable_asan target) if (CMAKE_CXX_COMPILER_ID IN_LIST FINUFFT_GNU_LIKE_COMPILERS) # Enable only on clang / gcc compilers. - target_compile_options(${target} PRIVATE $<$:-fsanitize=address>) - target_link_options(${target} PRIVATE $<$:-fsanitize=address>) + target_compile_options(${target} PRIVATE $<$:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>) + target_link_options(${target} PRIVATE $<$:-fsanitize=address -fsanitize=undefined -fsanitize=bounds-strict>) endif () endfunction() @@ -119,7 +124,8 @@ endfunction() # Utility function to set finufft compilation options. function(set_finufft_options target) set_property(TARGET ${target} PROPERTY POSITION_INDEPENDENT_CODE ON) - set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 14) + target_compile_features(${target} PRIVATE cxx_std_17) + enable_asan(${target}) target_compile_options(${target} PRIVATE SHELL:$<$:${FINUFFT_ARCH_FLAGS}>) @@ -127,9 +133,9 @@ function(set_finufft_options target) # Enable cx-limited-range on supported compilers target_compile_options(${target} PRIVATE $<$:-fcx-limited-range>) endif () - - target_include_directories(${target} PUBLIC $) - target_include_directories(${target} SYSTEM INTERFACE $) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELEASE}>) + target_compile_options(${target} PRIVATE $<$:${FINUFFT_CXX_FLAGS_RELWITHDEBINFO}>) + target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") if (FINUFFT_USE_OPENMP) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) # there are issues on windows with OpenMP and CMake, so we need to manually add the flags @@ -163,12 +169,13 @@ if (FINUFFT_USE_CPU) target_compile_definitions(finufft_f32 PRIVATE SINGLE) set_finufft_options(finufft_f32) target_link_libraries(finufft_f32 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) + target_link_libraries(finufft_f32 PRIVATE xsimd) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f64 PRIVATE) set_finufft_options(finufft_f64) target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) - + target_link_libraries(finufft_f64 PRIVATE xsimd) if (WIN32) add_library(finufft_f32_dll OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32_dll PRIVATE SINGLE dll_EXPORTS FINUFFT_DLL) @@ -193,8 +200,7 @@ if (FINUFFT_USE_CPU) if (NOT WIN32) target_link_libraries(finufft PUBLIC m) endif () - target_include_directories(finufft PUBLIC $) - target_include_directories(finufft SYSTEM INTERFACE $) + target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) set_finufft_options(finufft_static) @@ -203,8 +209,7 @@ if (FINUFFT_USE_CPU) if (NOT WIN32) target_link_libraries(finufft_static PUBLIC m) endif () - target_include_directories(finufft_static PUBLIC $) - target_include_directories(finufft_static SYSTEM INTERFACE $) + target_include_directories(finufft_static PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") file(GLOB FINUFFT_PUBLIC_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/include/finufft*.h") set_target_properties(finufft PROPERTIES PUBLIC_HEADER "${FINUFFT_PUBLIC_HEADERS}") diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake new file mode 100644 index 000000000..303ef2986 --- /dev/null +++ b/cmake/setupXSIMD.cmake @@ -0,0 +1,19 @@ +CPMAddPackage( + NAME findxtl + GIT_REPOSITORY "https://github.com/xtensor-stack/xtl.git" + GIT_TAG ${XTL_VERSION} + EXCLUDE_FROM_ALL YES + GIT_SHALLOW YES + OPTIONS "XTL_DISABLE_EXCEPTIONS YES" +) + +CPMAddPackage( + NAME findxsimd + GIT_REPOSITORY "https://github.com/xtensor-stack/xsimd.git" + GIT_TAG ${XSIMD_VERSION} + EXCLUDE_FROM_ALL YES + GIT_SHALLOW YES + OPTIONS + "XSIMD_SKIP_INSTALL YES" + "XSIMD_ENABLE_XTL_COMPLEX YES" +) diff --git a/devel/CMakeLists.txt b/devel/CMakeLists.txt index 58a750ce4..1f6097daf 100644 --- a/devel/CMakeLists.txt +++ b/devel/CMakeLists.txt @@ -18,5 +18,7 @@ if (benchmark_ADDED) endif() add_executable(foldrescale foldrescale.cpp) -target_link_libraries(foldrescale finufft benchmark) -target_compile_options(foldrescale PRIVATE -mavx2) +target_link_libraries(foldrescale finufft benchmark xsimd) +add_executable(padding padding.cpp) +target_link_libraries(padding finufft xsimd) +target_compile_options(padding PRIVATE -march=native) \ No newline at end of file diff --git a/devel/padding.cpp b/devel/padding.cpp new file mode 100644 index 000000000..844a4db17 --- /dev/null +++ b/devel/padding.cpp @@ -0,0 +1,278 @@ +// +// Created by mbarbone on 5/17/24. +// +#include +#include +#include +#include + +template static constexpr auto BestSIMDHelper(); + +template static constexpr auto GetPaddedSIMDSize(); + +template static uint16_t get_padding(uint16_t ns); + +template static constexpr auto get_padding(); + +template +using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; + +template static constexpr uint16_t min_batch_size(); + +template()> constexpr uint16_t max_batch_size(); + +template static constexpr auto find_optimal_batch_size(); + +// below there is some trickery to obtain the padded SIMD type to vectorize +// the given number of elements. +// improper use will cause the compiler to either throw an error on the recursion depth +// or on older ones... "compiler internal error please report" +// you have been warned. + +template static constexpr auto BestSIMDHelper() { + if constexpr (N % K == 0) { // returns void in the worst case + return xsimd::make_sized_batch{}; + } else { + return BestSIMDHelper> 1)>(); + } +} + +template constexpr uint16_t min_batch_size() { + if constexpr (std::is_void_v>) { + return min_batch_size(); + } else { + return N; + } +}; + +template constexpr uint16_t max_batch_size() { + if constexpr (!std::is_void_v>) { + return max_batch_size(); + } else { + return N; + } +}; + +template static constexpr auto find_optimal_batch_size() { + uint16_t min_iterations = N; + uint16_t optimal_batch_size = 1; + for (uint16_t batch_size = min_batch_size(); batch_size <= xsimd::batch::size; + batch_size *= 2) { + uint16_t iterations = (N + batch_size - 1) / batch_size; + if (iterations < min_iterations) { + min_iterations = iterations; + optimal_batch_size = batch_size; + } + } + return optimal_batch_size; +} + +template static constexpr auto GetPaddedSIMDSize() { + static_assert(N < 128); + return xsimd::make_sized_batch()>::type::size; +} + +template static constexpr auto get_padding() { + constexpr uint16_t width = GetPaddedSIMDSize(); + return ns % width == 0 ? 0 : width - (ns % width); +} + +template +static constexpr auto get_padding_helper(uint16_t runtime_ns) { + if constexpr (ns < 2) { + return 0; + } else { + if (runtime_ns == ns) { + return get_padding(); + } else { + return get_padding_helper(runtime_ns); + } + } +} + +template static uint16_t get_padding(uint16_t ns) { + return get_padding_helper(ns); +} + +template std::ostream &print(T arg) { + typename T::value_type sum = 0; + for (const auto &elem : arg) { + std::cout << elem << " "; + sum += elem; + } + std::cout << "sum is " << sum; + return std::cout; +} + +template constexpr uint16_t po2_in_between() { + std::uint16_t result = 0; + for (auto i = low; i <= high; i <<= 1) { + result++; + } + return result; +} + +template constexpr auto mixed_vectors() { + constexpr auto min_batch = min_batch_size(); + constexpr auto max_batch = max_batch_size(); + // compute all the power of 2 between min_batch and max_batch + + std::array() + 1> batch_sizes{1}; + for (uint16_t i = 1; i < batch_sizes.size(); i++) { + batch_sizes[i] = min_batch << (i - 1); + } + print(batch_sizes); + std::array chosen_batch_sizes{0}, dp{N + 1}; + dp[0] = 0; // 0 amount requires 0 coins + + for (uint16_t i = 0; i < N + 1; ++i) { + for (const auto batch_size : batch_sizes) { + if (batch_size <= i && dp[i - batch_size] + 1 < dp[i]) { + dp[i] = dp[i - batch_size] + 1; + chosen_batch_sizes[i] = batch_size; + } + } + } + // Build the sequence of coins that fit in N + std::array sequence{0}; + auto index = 0; + for (int i = N; i > 0; i -= chosen_batch_sizes[i]) { + sequence[index++] = chosen_batch_sizes[i]; + } + // return the not zero elements in the sequence + return sequence; +} + +int main(int argc, char *argv[]) { + std::cout << "Min batch size for single precision is " + << uint64_t(min_batch_size()) << std::endl; + std::cout << "Max batch size for single precision is " + << uint64_t(max_batch_size()) << std::endl; + std::cout << "Min batch size for double precision is " + << uint64_t(min_batch_size()) << std::endl; + std::cout << "Max batch size for double precision is " + << uint64_t(max_batch_size()) << std::endl; + + std::cout << "Best SIMD single precision" << std::endl; + std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) + << std::endl; + + std::cout << "Best SIMD double precision" << std::endl; + std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 8 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 12 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 16 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 20 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 24 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 28 << " is " << uint64_t(BestSIMD::size) + << std::endl; + std::cout << "SIMD for " << 32 << " is " << uint64_t(BestSIMD::size) + << std::endl; + + std::cout << "Padded SIMD single precision" << std::endl; + std::cout << "Padded SIMD for " << 4 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 6 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 10 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 12 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 15 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 18 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 22 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 26 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 30 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 32 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + + std::cout << "Padded SIMD double precision" << std::endl; + std::cout << "Padded SIMD for " << 4 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 6 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 10 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 12 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 15 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 18 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 22 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 26 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 30 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + std::cout << "Padded SIMD for " << 32 << " is " + << uint64_t(GetPaddedSIMDSize()) << std::endl; + + std::cout << "single precision" << std::endl; + for (auto i = 2; i < 16; i++) { + std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding(i * 2)) + << std::endl; + } + + std::cout << "double precision" << std::endl; + for (auto i = 2; i < 16; i++) { + std::cout << "Padding for " << i * 2 << " is " << uint64_t(get_padding(i * 2)) + << std::endl; + } + + std::cout << "single precision" << std::endl; + std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 11 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 13 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 15 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "double precision" << std::endl; + std::cout << "Padding for " << 3 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 5 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 7 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 9 * 2 << " is " << uint64_t(get_padding()) + << std::endl; + std::cout << "Padding for " << 11 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 13 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + std::cout << "Padding for " << 15 * 2 << " is " + << uint64_t(get_padding()) << std::endl; + + return 0; +} \ No newline at end of file diff --git a/include/finufft/defs.h b/include/finufft/defs.h index c2a5c48f7..df7ad4249 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -24,7 +24,8 @@ // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. -#define BIGINT int64_t +#define BIGINT int64_t +#define UBIGINT uint64_t // Precision-independent real and complex types, for private lib/test compile #ifdef SINGLE #define FLT float @@ -40,10 +41,16 @@ // this avoids the use of macros to implement functions #if defined(_MSC_VER) #define FINUFFT_ALWAYS_INLINE __forceinline +#define FINUFFT_NEVER_INLINE __declspec(noinline) +#define FINUFFT_RESTRICT __restrict #elif defined(__GNUC__) || defined(__clang__) #define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline +#define FINUFFT_NEVER_INLINE __attribute__((noinline)) +#define FINUFFT_RESTRICT __restrict__ #else #define FINUFFT_ALWAYS_INLINE inline +#define FINUFFT_NEVER_INLINE +#define FINUFFT_RESTRICT #endif // ------------- Library-wide algorithm parameter settings ---------------- @@ -51,6 +58,10 @@ // Library version (is a string) #define FINUFFT_VER "2.2.0" +// Smallest possible kernel spread width per dimension, in fine grid points +// (used only in spreadinterp.cpp) +#define MIN_NSPREAD 2 + // Largest possible kernel spread width per dimension, in fine grid points // (used only in spreadinterp.cpp) #define MAX_NSPREAD 16 diff --git a/src/finufft.cpp b/src/finufft.cpp index 03c1d9ac6..8b9c6006b 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -38,41 +38,41 @@ using namespace finufft::quadrature; Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. - 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 3a internally uses CPX, and Step 3b internally uses real arithmetic and FFTW style complex. TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 0 internally uses CPX, and Step 1 internally uses real arithmetic and FFTW style complex. TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. No references to FFTW are needed here. CPX arithmetic is used. MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. Design notes for guru interface implementation: @@ -111,10 +111,10 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI *nf = next235even(*nf); // expensive at huge nf return 0; } else { - fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", - __func__, (double)*nf, (double)MAX_NF); + fprintf( + stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a malloc\n", + __func__, (double)*nf, (double)MAX_NF); return FINUFFT_ERR_MAXNALLOC; } } @@ -198,8 +198,8 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) Outputs: fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 FLTs) + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 FLTs) Compare onedim_dct_kernel which has same interface, but computes DFT of sampled kernel, not quite the same object. @@ -253,8 +253,8 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts Inputs: nk - number of freqs k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. opts - spreading opts object, needed to eval kernel (must be already set up) Outputs: @@ -291,11 +291,11 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). fk is size-ms FLT complex array (2*ms FLTs alternating re,im parts) fw is a FFTW style complex array, ie FLT [nf1][2], essentially FLTs - alternating re,im parts. + alternating re,im parts. ker is real-valued FLT array of length nf1/2+1. Single thread only, but shouldn't matter since mostly data movement. @@ -305,7 +305,7 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI This could be removed by passing in an inverse kernel and doing mults. todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? + real divide, or is there a way to force a real divide? Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 */ @@ -328,8 +328,7 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI fk[pn++] = prefac * fw[nf1 + k][1] / ker[-k]; // im } } else { // read fk, write out to fw w/ zero padding... - for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where - // needed + for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where needed fw[k][0] = fw[k][1] = 0.0; } for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k @@ -352,14 +351,14 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) + 1: use FFT-style (pos then negative, on each dim) fk is complex array stored as 2*ms*mt FLTs alternating re,im parts, with - ms looped over fast and mt slow. + ms looped over fast and mt slow. fw is a FFTW style complex array, ie FLT [nf1*nf2][2], essentially FLTs - alternating re,im parts; again nf1 is fast and nf2 slow. + alternating re,im parts; again nf1 is fast and nf2 slow. ker1, ker2 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1 - respectively. + respectively. Barnett 2/1/17, Fixed mt=0 case 3/14/17. modeord 10/25/17 */ @@ -373,8 +372,7 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B pn = 2 * (k2max + 1) * ms; } // or, instead, FFT ordering if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all - // dims + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all dims fw[j][0] = fw[j][1] = 0.0; for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs // point fk and fw to the start of this y value's row (2* is for complex): @@ -395,14 +393,14 @@ void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, B if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) + 1: use FFT-style (pos then negative, on each dim) fk is complex array stored as 2*ms*mt*mu FLTs alternating re,im parts, with - ms looped over fastest and mu slowest. + ms looped over fastest and mu slowest. fw is a FFTW style complex array, ie FLT [nf1*nf2*nf3][2], effectively - FLTs alternating re,im parts; again nf1 is fastest and nf3 slowest. + FLTs alternating re,im parts; again nf1 is fastest and nf3 slowest. ker1, ker2, ker3 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. + and nf3/2+1 respectively. Barnett 2/1/17, Fixed mu=0 case 3/14/17. modeord 10/25/17 */ @@ -439,7 +437,7 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) Returns 0 (no error reporting for now). Notes: 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp Barnett 5/19/20, based on Malleo 2019. */ @@ -604,7 +602,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, p->tol = tol; p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - // choose overall # threads... + // choose overall # threads... #ifdef _OPENMP int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); int nthr = ompmaxnthr; // default: use as many as OMP gives us @@ -655,8 +653,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, if (type == 3) // could move to setpts, more known? p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, typ + // tol, 12-core xeon p->opts.upsampfac = 1.25; } if (p->opts.debug > 1) @@ -725,9 +723,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, } if (p->opts.debug) { // "long long" here is to avoid warnings with printf... - printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " - "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " - "batchSize=%d ", + printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n " + " ntrans=%d nthr=%d batchSize=%d ", __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, p->batchSize); @@ -771,16 +768,16 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, timer.restart(); // plan the FFTW int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, - // idist, ot, onembed, ostride, odist, sign, flags + // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, + // ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. - // FIXME: Since this might override what the user wants for fftw, we'd like to - // set it just for our one plan and then revert to the user value. - // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and - // there isn't a convenient mechanism to probe the version + // FIXME: Since this might override what the user wants for fftw, we'd like to set + // it just for our one plan and then revert to the user value. Unfortunately + // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a + // convenient mechanism to probe the version FFTW_PLAN_TH(nthr_fft); p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, @@ -842,9 +839,9 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT if (ier) // no warnings allowed here return ier; timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts calls + // causing memory leak. We don't know it is the same size as before, so we have to + // malloc each time. if (p->sortIndices) free(p->sortIndices); p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { @@ -969,8 +966,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT FLT phase = p->t3P.D1 * xj[j]; if (d > 1) phase += p->t3P.D2 * yj[j]; if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler - // e^{+-i.phase} + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} } } else for (BIGINT j = 0; j < nj; ++j) @@ -981,11 +977,9 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT for (BIGINT k = 0; k < nk; ++k) { p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < - // pi/R + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < pi/R if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < - // pi/R + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < pi/R } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... @@ -1030,9 +1024,9 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts - // calls causing memory leak. We don't know it is the same size as before, so we - // have to malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts calls + // causing memory leak. We don't know it is the same size as before, so we have to + // malloc each time. if (p->sortIndices) free(p->sortIndices); p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { @@ -1079,16 +1073,16 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { /* See ../docs/cguru.doc for current documentation. - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. + */ CNTime timer; timer.start(); @@ -1154,8 +1148,7 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, - t_deconv = 0.0; // accumulated timings + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, t_deconv = 0.0; // accumulated timings if (p->opts.debug) printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, p->nbatch, p->batchSize); @@ -1187,15 +1180,14 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { t_spr += timer.elapsedsec(); // for (int j=0;jnf1;++j) - // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // - // debug + // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // debug // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ + still the same size, as Andrea explained; just wastes a few flops) */ FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX *)(p->fwBatch)); t_t2 += timer.elapsedsec(); diff --git a/src/ker_horner_allw_loop_constexpr.h b/src/ker_horner_allw_loop_constexpr.h new file mode 100644 index 000000000..25a791ddb --- /dev/null +++ b/src/ker_horner_allw_loop_constexpr.h @@ -0,0 +1,913 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. +#include + +template +constexpr std::array, nc> get_horner_coeffs() noexcept { + if constexpr (w == 2) { + return std::array, nc>{ + {{4.5147043243215315E+01, 4.5147043243215300E+01}, + {5.7408070938221300E+01, -5.7408070938221293E+01}, + {-1.8395117920046484E+00, -1.8395117920046560E+00}, + {-2.0382426253182082E+01, 2.0382426253182086E+01}, + {-2.0940804433577420E+00, -2.0940804433577389E+00}}}; + } else if constexpr (w == 3) { + return std::array, nc>{ + {{1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02}, + {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02}, + {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02}, + {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01}, + {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01}, + {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00}}}; + } else if constexpr (w == 4) { + return std::array, nc>{ + {{5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, + 5.4284366850213223E+02}, + {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, + -1.4650917259256937E+03}, + {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, + 1.4186910680718347E+03}, + {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, + -5.1133995502497424E+02}, + {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, + -4.8293622641174061E+01}, + {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, + 7.8386867802392359E+01}, + {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, + -1.0039212571700640E+01}}}; + } else if constexpr (w == 5) { + return std::array, nc>{ + {{9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, + 3.7794697666613283E+04, 9.9223677575398403E+02}, + {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, + -3.7938404259811381E+04, -3.0430174925083829E+03}, + {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, + 7.7501368899498730E+03, 3.6092689177271218E+03}, + {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, + 3.8875294641277369E+03, -1.9990077310495412E+03}, + {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, + -1.5861137916762643E+03, 4.0071733590403909E+02}, + {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, + -1.2316471075214508E+02, 9.1301168206167233E+01}, + {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, + 1.1960590540262307E+02, -5.5339722671223605E+01}, + {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, + -2.2839981872943818E+00, 3.3762488150341459E+00}}}; + } else if constexpr (w == 6) { + return std::array, nc>{ + {{2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, + 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03}, + {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, + -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}, + {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, + -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04}, + {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, + 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03}, + {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, + 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03}, + {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, + -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02}, + {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, + -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02}, + {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, + 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01}, + {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, + -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00}}}; + } else if constexpr (w == 7) { + return std::array, nc>{ + {{3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, + 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, + 3.9948351830642519E+03}, + {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, + -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, + -1.5290160332958067E+04}, + {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, + -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, + 2.4458227486795113E+04}, + {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, + 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, + -2.1166189345866893E+04}, + {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, + 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, + 1.0542795672361213E+04}, + {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, + 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, + -2.7903491906078298E+03}, + {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, + -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, + 1.6069721419533221E+02}, + {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, + 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, + 1.2289277375319763E+02}, + {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, + 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, + -3.2270164900224913E+01}, + {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, + 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, + 1.4761410890866353E-01}}}; + } else if constexpr (w == 8) { + return std::array, nc>{ + {{7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, + 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, + 1.7297637497600049E+06, 7.3898000697447915E+03}, + {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, + 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, + -3.1853145713323941E+06, -3.0719636811267606E+04}, + {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, + -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, + 2.4101183255475126E+06, 5.4488498478251728E+04}, + {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, + -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, + -9.0469037926849339E+05, -5.3926359802542138E+04}, + {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, + 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, + 1.3079802224392109E+05, 3.2444118016247590E+04}, + {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, + 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, + 2.2700360645707628E+04, -1.1864306345505294E+04}, + {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, + -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, + -1.1569135767377924E+04, 2.2812256770903286E+03}, + {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, + -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, + 9.7513976461209836E+02, -8.5503535637013552E+00}, + {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, + 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, + 2.8246898554219217E+02, -1.0230637348345138E+02}, + {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, + 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, + -6.1692257626845532E+01, 1.9200143062947120E+01}, + {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, + -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, + -1.7334408840526812E+00, 3.7894993760636758E-01}}}; + } else if constexpr (w == 9) { + return std::array, nc>{ + {{1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, + 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, + 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04}, + {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, + 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, + -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04}, + {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, + -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, + 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05}, + {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, + -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, + -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05}, + {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, + -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, + -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04}, + {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, + 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, + 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04}, + {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, + 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, + -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04}, + {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, + -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, + -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03}, + {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, + -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, + 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01}, + {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, + 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, + 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01}, + {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, + 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, + -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00}}}; + } else if constexpr (w == 10) { + return std::array, nc>{{{ + 2.2594586605749264E+04, + 1.3595989066786593E+07, + 4.4723032442444897E+08, + 3.3781755837397518E+09, + 8.6836783895849819E+09, + 8.6836783895849762E+09, + 3.3781755837397494E+09, + 4.4723032442444897E+08, + 1.3595989066786474E+07, + 2.2594586605749344E+04, + }, + { + 1.0729981697645642E+05, + 3.0651490267742988E+07, + 5.9387966085130465E+08, + 2.4434902657508330E+09, + 2.0073077861288922E+09, + -2.0073077861288943E+09, + -2.4434902657508330E+09, + -5.9387966085130453E+08, + -3.0651490267742816E+07, + -1.0729981697645638E+05, + }, + { + 2.2340399734184606E+05, + 3.0258214643190462E+07, + 3.1512411458738232E+08, + 4.3618276932319808E+08, + -7.8178848450497293E+08, + -7.8178848450497019E+08, + 4.3618276932319826E+08, + 3.1512411458738232E+08, + 3.0258214643190313E+07, + 2.2340399734184548E+05, + }, + { + 2.6917433004353486E+05, + 1.6875651476661228E+07, + 7.4664745481963441E+07, + -9.5882157211118385E+07, + -2.0622994435532519E+08, + 2.0622994435532743E+08, + 9.5882157211118177E+07, + -7.4664745481963515E+07, + -1.6875651476661161E+07, + -2.6917433004353428E+05, + }, + { + 2.0818422772177903E+05, + 5.6084730690362519E+06, + 1.4435118192351763E+06, + -4.0063869969544649E+07, + 3.2803674392747045E+07, + 3.2803674392746095E+07, + -4.0063869969546899E+07, + 1.4435118192351642E+06, + 5.6084730690362034E+06, + 2.0818422772177853E+05, + }, + { + 1.0781139496011091E+05, + 9.9202615851199068E+05, + -3.3266265543962116E+06, + -4.8557049011479173E+05, + 1.0176155522772279E+07, + -1.0176155522772269E+07, + 4.8557049011678610E+05, + 3.3266265543963453E+06, + -9.9202615851196018E+05, + -1.0781139496011072E+05, + }, + { + 3.7380102688153558E+04, + 1.2716675000355666E+04, + -6.2163527451774501E+05, + 1.4157962667184104E+06, + -8.4419693137680157E+05, + -8.4419693137743860E+05, + 1.4157962667189445E+06, + -6.2163527451771160E+05, + 1.2716675000340010E+04, + 3.7380102688153442E+04, + }, + { + 8.1238936393894646E+03, + -3.4872365530450072E+04, + 2.3913680325196314E+04, + 1.2428850301830019E+05, + -3.2158255329716846E+05, + 3.2158255329951923E+05, + -1.2428850301867779E+05, + -2.3913680325277423E+04, + 3.4872365530457188E+04, + -8.1238936393894255E+03, + }, + { + 7.8515926628982663E+02, + -6.6607899119372642E+03, + 2.0167398338513311E+04, + -2.8951401344519112E+04, + 1.4622828142848679E+04, + 1.4622828143544031E+04, + -2.8951401346900999E+04, + 2.0167398338398041E+04, + -6.6607899119505255E+03, + 7.8515926628967964E+02, + }, + { + -1.0147176570537010E+02, + -3.5304284185385157E+01, + 1.3576976854876134E+03, + -4.3921059353471856E+03, + 7.3232085271125388E+03, + -7.3232085273978546E+03, + 4.3921059367737662E+03, + -1.3576976854043962E+03, + 3.5304284185385157E+01, + 1.0147176570550941E+02, + }, + { + -4.3161545259389186E+01, + 1.5498490981579428E+02, + -3.1771250774232175E+02, + 3.7215448796427023E+02, + -1.7181762832770994E+02, + -1.7181763036843782E+02, + 3.7215448789408123E+02, + -3.1771250773692140E+02, + 1.5498490982186786E+02, + -4.3161545259547800E+01, + }, + { + -4.2916172038214198E+00, + 1.7402146071148604E+01, + -4.7947588069135868E+01, + 9.2697698088029625E+01, + -1.2821427596894478E+02, + 1.2821427705670308E+02, + -9.2697698297776569E+01, + 4.7947588093524907E+01, + -1.7402146074502035E+01, + 4.2916172038452141E+00, + }}}; + } else if constexpr (w == 11) { + return std::array, nc>{{{ + 3.7794653219809625E+04, + 3.4782300224660739E+07, + 1.6188020733727551E+09, + 1.7196758809615005E+10, + 6.3754384857724617E+10, + 9.7196447559193497E+10, + 6.3754384857724617E+10, + 1.7196758809614998E+10, + 1.6188020733727560E+09, + 3.4782300224660769E+07, + 3.7794653219808984E+04, + }, + { + 1.8969206922085886E+05, + 8.4769319065313652E+07, + 2.4230555767723408E+09, + 1.5439732722639101E+10, + 2.7112836839612309E+10, + 2.5609833368650835E-06, + -2.7112836839612328E+10, + -1.5439732722639105E+10, + -2.4230555767723408E+09, + -8.4769319065313682E+07, + -1.8969206922085711E+05, + }, + { + 4.2138380313901440E+05, + 9.2050522922791913E+07, + 1.5259983101266613E+09, + 4.7070559561237173E+09, + -1.2448027572952359E+09, + -1.0161446790279301E+10, + -1.2448027572952316E+09, + 4.7070559561237268E+09, + 1.5259983101266615E+09, + 9.2050522922791913E+07, + 4.2138380313901149E+05, + }, + { + 5.4814313598122005E+05, + 5.8085130777589552E+07, + 4.9484006166551048E+08, + 1.6222124676640952E+08, + -2.0440440381345339E+09, + 9.1416457449079640E-06, + 2.0440440381345336E+09, + -1.6222124676640788E+08, + -4.9484006166551071E+08, + -5.8085130777589560E+07, + -5.4814313598121714E+05, + }, + { + 4.6495183529254980E+05, + 2.3067199578027144E+07, + 6.9832590192482382E+07, + -2.2024799260683522E+08, + -1.2820270942588677E+08, + 5.1017181199129778E+08, + -1.2820270942588474E+08, + -2.2024799260683942E+08, + 6.9832590192482322E+07, + 2.3067199578027155E+07, + 4.6495183529254742E+05, + }, + { + 2.7021781043532980E+05, + 5.6764510325100143E+06, + -5.5650761736748898E+06, + -3.9907385617900200E+07, + 7.2453390663687646E+07, + 1.2300109686762266E-05, + -7.2453390663684472E+07, + 3.9907385617899075E+07, + 5.5650761736749066E+06, + -5.6764510325099993E+06, + -2.7021781043532846E+05, + }, + { + 1.0933249308680627E+05, + 6.9586821127987828E+05, + -3.6860240321937902E+06, + 2.7428169457736355E+06, + 8.3392008440593518E+06, + -1.6402201025046850E+07, + 8.3392008440698013E+06, + 2.7428169457778852E+06, + -3.6860240321937371E+06, + 6.9586821127989423E+05, + 1.0933249308680571E+05, + }, + { + 3.0203516161820498E+04, + -3.6879059542768438E+04, + -4.1141031216788280E+05, + 1.4111389975267777E+06, + -1.5914376635331670E+06, + 9.4095582602103753E-06, + 1.5914376635379130E+06, + -1.4111389975247320E+06, + 4.1141031216776522E+05, + 3.6879059542750314E+04, + -3.0203516161820549E+04, + }, + { + 5.1670143574922731E+03, + -2.8613147115372190E+04, + 4.3560195427081359E+04, + 4.8438679582765450E+04, + -2.5856630639231802E+05, + 3.7994883866738499E+05, + -2.5856630640319458E+05, + 4.8438679579510936E+04, + 4.3560195426766244E+04, + -2.8613147115376054E+04, + 5.1670143574922913E+03, + }, + { + 3.0888018539740131E+02, + -3.7949446187471626E+03, + 1.4313303204988082E+04, + -2.6681600235594462E+04, + 2.3856005166166615E+04, + 8.6424601730164351E-06, + -2.3856005155895236E+04, + 2.6681600234453199E+04, + -1.4313303205083188E+04, + 3.7949446187583080E+03, + -3.0888018539728523E+02, + }, + { + -8.3747489794189363E+01, + 1.1948077479405792E+02, + 4.8528498015072080E+02, + -2.5024391114755094E+03, + 5.3511195318669425E+03, + -6.7655484107390166E+03, + 5.3511195362291774E+03, + -2.5024391131167667E+03, + 4.8528498019392708E+02, + 1.1948077480620087E+02, + -8.3747489794426258E+01, + }, + { + -2.2640047135517630E+01, + 9.0840898563949466E+01, + -2.1597187544386938E+02, + 3.1511229111443720E+02, + -2.4856617998395282E+02, + 6.1683918215190516E-06, + 2.4856618439352349E+02, + -3.1511228757800421E+02, + 2.1597187557069353E+02, + -9.0840898570046704E+01, + 2.2640047135565219E+01, + }, + { + -1.6306382886201207E+00, + 7.3325946591320434E+00, + -2.3241017682854558E+01, + 5.1715494398901185E+01, + -8.2673000279130790E+01, + 9.6489719151212370E+01, + -8.2673010381149226E+01, + 5.1715494328769353E+01, + -2.3241018024860580E+01, + 7.3325946448852415E+00, + -1.6306382886460551E+00, + }}}; + } else if constexpr (w == 12) { + return std::array, nc>{ + {{6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, + 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, + 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, + 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04}, + {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, + 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, + -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, + -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}, + {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, + 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, + -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, + 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}, + {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, + 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, + 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, + -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}, + {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, + -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, + 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, + 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}, + {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, + -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, + -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, + -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}, + {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, + -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, + -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, + -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}, + {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, + 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, + 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, + 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}, + {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, + 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, + 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, + -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}, + {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, + -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, + -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, + -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}, + {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, + -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, + -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, + 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}, + {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, + -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, + 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, + -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}, + {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, + 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, + 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, + -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}, + {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, + 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, + -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, + 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}}}; + } else if constexpr (w == 13) { + return std::array, nc>{ + {{9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, + 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, + 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, + 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, + 9.8715725867496090E+04}, + {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, + 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, + -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, + -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, + -5.4491110456935526E+05}, + {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, + 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, + -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, + 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, + 1.3504711883426069E+06}, + {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, + 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, + 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, + -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, + -1.9937206140846489E+06}, + {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, + 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, + 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, + 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, + 1.9607419630386417E+06}, + {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, + -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, + 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, + 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, + -1.3593773865640305E+06}, + {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, + -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, + -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, + -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, + 6.8417206432039209E+05}, + {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, + -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, + 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, + 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, + -2.5248269397037517E+05}, + {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, + 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, + 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, + 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, + 6.7530100970876316E+04}, + {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, + 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, + 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, + -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, + -1.2421368748961073E+04}, + {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, + -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, + -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, + -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, + 1.2904654687545376E+03}, + {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, + -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, + 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, + 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, + 1.9043622268312891E+01}, + {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, + -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, + 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, + -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, + -3.0093984465884773E+01}, + {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, + 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, + 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, + -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, + 4.3050286009485790E+00}, + {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, + 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, + -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, + 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, + -1.0957333740315604E-01}}}; + } else if constexpr (w == 14) { + return std::array, nc>{ + {{1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, + 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, + 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, + 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, + 4.4723032442444855E+08, 1.5499533202970232E+05}, + {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, + 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, + 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, + -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, + -1.3065352538728662E+09, -8.9188339002979454E+05}, + {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, + 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, + -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, + 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, + 1.7532505043698275E+09, 2.3170473769380399E+06}, + {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, + 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, + -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, + -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, + -1.4278058213962219E+09, -3.6089249230396664E+06}, + {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, + 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, + 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, + -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, + 7.8376718099107552E+08, 3.7733555140852560E+06}, + {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, + -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, + 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, + 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, + -3.0340753492383808E+08, -2.8079157920112377E+06}, + {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, + -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, + -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, + -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, + 8.3513615594416171E+07, 1.5361613559533576E+06}, + {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, + -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, + -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, + -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, + -1.5741723594963137E+07, -6.2759409419590747E+05}, + {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, + 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, + 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, + 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, + 1.7156606891560503E+06, 1.9151404903936724E+05}, + {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, + 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, + 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, + 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, + 2.2565910606306688E+03, -4.2715272622820135E+04}, + {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, + 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, + -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, + -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, + -3.5474227033406372E+04, 6.4806786523010323E+03}, + {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, + -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, + -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, + 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, + 5.5416668533342381E+03, -4.9913632906195977E+02}, + {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, + -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, + 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, + 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, + -1.8970588700495102E+02, -3.3076333168231550E+01}, + {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, + -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, + 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, + -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, + -5.7000699100179844E+01, 1.4394533639240331E+01}, + {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, + 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, + -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, + -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, + 8.5113924808491728E+00, -1.5925952194145006E+00}, + {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, + 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, + -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, + 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, + -1.2876100566420701E-01, -1.5984859433053292E-02}}}; + } else if constexpr (w == 15) { + return std::array, nc>{ + {{2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, + 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, + 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, + 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, + 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05}, + {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, + 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, + 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, + -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, + -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06}, + {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, + 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, + -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, + 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, + 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06}, + {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, + 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, + -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, + 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, + -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06}, + {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, + 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, + -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, + -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, + 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06}, + {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, + 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, + 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, + 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, + -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06}, + {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, + -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, + 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, + 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, + 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06}, + {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, + -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, + -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, + -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, + -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06}, + {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, + -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, + -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, + -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, + -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05}, + {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, + 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, + 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, + 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, + 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05}, + {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, + 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, + 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, + -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, + -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04}, + {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, + 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, + -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, + -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, + -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03}, + {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, + -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, + -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, + 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, + 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02}, + {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, + -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, + 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, + 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, + -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01}, + {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, + 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, + 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, + -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, + -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00}, + {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, + 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, + -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, + -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, + 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01}, + {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, + 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, + -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, + 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, + -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}}}; + } else if constexpr (w == 16) { + return std::array, nc>{ + {{3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, + 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, + 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, + 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, + 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, + 3.6434551345571183E+05}, + {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, + 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, + 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, + -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, + -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, + -2.2576246485480373E+06}, + {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, + 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, + 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, + 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, + 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, + 6.3730995546265030E+06}, + {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, + 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, + -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, + 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, + -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, + -1.0896915393078227E+07}, + {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, + 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, + -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, + -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, + 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, + 1.2655725616100591E+07}, + {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, + 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, + 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, + -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, + -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, + -1.0609303958036322E+07}, + {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, + 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, + 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, + 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, + 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, + 6.6544809363384554E+06}, + {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, + -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, + -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, + 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, + 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, + -3.1906872142825015E+06}, + {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, + -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, + -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, + -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, + -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, + 1.1821527096621762E+06}, + {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, + -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, + -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, + 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, + 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, + -3.3854610744279937E+05}, + {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, + 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, + 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, + 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, + 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, + 7.3893334077307401E+04}, + {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, + 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, + 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, + -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, + -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, + -1.1778892113376129E+04}, + {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, + 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, + -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, + -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, + 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, + 1.2019749667911419E+03}, + {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, + -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, + -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, + 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, + 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, + -3.1189837631106176E+01}, + {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, + -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, + 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, + 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, + -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, + -1.2975319073977776E+01}, + {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, + 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, + 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, + -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, + -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, + 2.3155118723150525E+00}, + {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, + 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, + -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, + -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, + 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, + -1.5401723756214594E-01}, + {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, + 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, + -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, + 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, + -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, + -1.1808834826225629E-02}}}; + } else { + static_assert(w >= 2, "w must be >= 2"); + static_assert(w <= 16, "w must be <= 16"); + return {}; + } +}; diff --git a/src/ker_lowupsampfac_horner_allw_loop_constexpr.c b/src/ker_lowupsampfac_horner_allw_loop_constexpr.c new file mode 100644 index 000000000..cfbbb0964 --- /dev/null +++ b/src/ker_lowupsampfac_horner_allw_loop_constexpr.c @@ -0,0 +1,192 @@ +// Code generated by gen_all_horner_C_code.m in finufft/devel +// Authors: Alex Barnett & Ludvig af Klinteberg. +// (C) The Simons Foundation, Inc. + if constexpr(w==2) { + FLT c0[] = {2.3711015472112514E+01, 2.3711015472112514E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {2.5079742199350562E+01, -2.5079742199350562E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {-3.5023281580177050E+00, -3.5023281580177086E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {-7.3894949249195587E+00, 7.3894949249195632E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); + } else if constexpr(w==3) { + FLT c0[] = {5.9620016143346824E+01, 2.4110216701187497E+02, 5.9620016148621815E+01, 0.0000000000000000E+00}; + FLT c1[] = {9.7575520958604258E+01, 9.4807967775797928E-16, -9.7575520952908519E+01, 0.0000000000000000E+00}; + FLT c2[] = {3.5838417859768512E+01, -7.3472145274965371E+01, 3.5838417865129472E+01, 0.0000000000000000E+00}; + FLT c3[] = {-1.0721643298166471E+01, -2.1299978194824344E-16, 1.0721643303220413E+01, 0.0000000000000000E+00}; + FLT c4[] = {-7.0570630207138318E+00, 9.1538553399011260E+00, -7.0570630151506633E+00, 0.0000000000000000E+00}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); + } else if constexpr(w==4) { + FLT c0[] = {1.2612470018753689E+02, 1.1896204292999116E+03, 1.1896204292999118E+03, 1.2612470018753696E+02}; + FLT c1[] = {2.6158034850676626E+02, 5.6161104654809810E+02, -5.6161104654809844E+02, -2.6158034850676620E+02}; + FLT c2[] = {1.7145379463699527E+02, -1.6695967127766517E+02, -1.6695967127766514E+02, 1.7145379463699527E+02}; + FLT c3[] = {2.3525961965887870E+01, -1.0057439659768858E+02, 1.0057439659768873E+02, -2.3525961965887827E+01}; + FLT c4[] = {-1.5608307370340880E+01, 9.5627412100260845E+00, 9.5627412100260205E+00, -1.5608307370340908E+01}; + FLT c5[] = {-4.5715207776748699E+00, 7.9904373067895493E+00, -7.9904373067893877E+00, 4.5715207776749462E+00}; + for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); + } else if constexpr(w==5) { + FLT c0[] = {2.4106943677442615E+02, 4.3538384278025542E+03, 9.3397486707381995E+03, 4.3538384278025515E+03, 2.4106943677442607E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {5.8781364250328272E+02, 3.4742855804122028E+03, -7.3041306797303120E-14, -3.4742855804122009E+03, -5.8781364250328249E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {5.1234107167555862E+02, 3.5219546517037116E+02, -1.7076861141633149E+03, 3.5219546517037247E+02, 5.1234107167555862E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {1.7540956907856057E+02, -3.5792356187777074E+02, -4.9888896652511712E-13, 3.5792356187777165E+02, -1.7540956907856059E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {-2.1768066955094961E-01, -7.8322173187697558E+01, 1.3904039464934516E+02, -7.8322173187697842E+01, -2.1768066955103071E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {-1.4207955403641256E+01, 1.6019466986221790E+01, 5.4386376890865855E-13, -1.6019466986220916E+01, 1.4207955403641320E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {-2.1966493586753826E+00, 5.0672636163194582E+00, -6.7340544905090631E+00, 5.0672636163189448E+00, -2.1966493586753089E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); + } else if constexpr(w==6) { + FLT c0[] = {4.3011762559089101E+02, 1.3368828836127070E+04, 4.9861340433371224E+04, 4.9861340433371253E+04, 1.3368828836127073E+04, 4.3011762559835148E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {1.1857225840065141E+03, 1.4112553227730617E+04, 1.5410005180819440E+04, -1.5410005180819426E+04, -1.4112553227730616E+04, -1.1857225839984601E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {1.2460481448413077E+03, 4.3127030215084960E+03, -5.5438591621431169E+03, -5.5438591621431306E+03, 4.3127030215084960E+03, 1.2460481448488902E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {6.0825549344387753E+02, -3.4106010789547094E+02, -1.9775725023673197E+03, 1.9775725023673208E+03, 3.4106010789547116E+02, -6.0825549343673094E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {1.1264961069783706E+02, -3.9740822717991142E+02, 2.7557540616463064E+02, 2.7557540616462472E+02, -3.9740822717991210E+02, 1.1264961070570448E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {-1.5387906304333878E+01, -3.2640579296387394E+01, 1.1683718215647470E+02, -1.1683718215646800E+02, 3.2640579296390861E+01, 1.5387906311562851E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {-9.3947198873910249E+00, 1.5069930500881778E+01, -8.0900452409597179E+00, -8.0900452409538364E+00, 1.5069930500884301E+01, -9.3947198802581902E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c7[] = {-5.6048841964539509E-01, 2.3377422080924530E+00, -4.2391567591836514E+00, 4.2391567591841817E+00, -2.3377422080928629E+00, 5.6048842664294984E-01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); + } else if constexpr(w==7) { + FLT c0[] = {7.2950392616203249E+02, 3.6439117038309480E+04, 2.1220891582018422E+05, 3.6180058567561524E+05, 2.1220891582018445E+05, 3.6439117038309487E+04, 7.2950392617434545E+02, 0.0000000000000000E+00}; + FLT c1[] = {2.2197790785452576E+03, 4.6392067080426248E+04, 1.1568051746995670E+05, -1.1902861988308852E-11, -1.1568051746995671E+05, -4.6392067080426241E+04, -2.2197790785319785E+03, 0.0000000000000000E+00}; + FLT c2[] = {2.6796845075663955E+03, 2.0921129984587249E+04, 3.9399551345574849E+01, -4.7251335435527435E+04, 3.9399551345580633E+01, 2.0921129984587245E+04, 2.6796845075789142E+03, 0.0000000000000000E+00}; + FLT c3[] = {1.6253748990844499E+03, 2.6138488347211564E+03, -1.0037546705421508E+04, 2.6823166126907972E-11, 1.0037546705421508E+04, -2.6138488347211546E+03, -1.6253748990726619E+03, 0.0000000000000000E+00}; + FLT c4[] = {4.9106375852553418E+02, -8.6668269315416171E+02, -1.0513434716618249E+03, 2.8444456471590756E+03, -1.0513434716618387E+03, -8.6668269315416057E+02, 4.9106375853851472E+02, 0.0000000000000000E+00}; + FLT c5[] = {4.0739167949763157E+01, -2.8515155742293922E+02, 3.9930326803801455E+02, 2.4847312048933061E-11, -3.9930326803798215E+02, 2.8515155742293899E+02, -4.0739167937835738E+01, 0.0000000000000000E+00}; + FLT c6[] = {-1.7148987139838667E+01, 7.5799002551700223E-01, 6.3260304953160343E+01, -1.0529869309160161E+02, 6.3260304953194023E+01, 7.5799002552709915E-01, -1.7148987128069749E+01, 0.0000000000000000E+00}; + FLT c7[] = {-4.5424411501060264E+00, 9.8749254058318616E+00, -9.6456179777547195E+00, 2.0621161109877312E-11, 9.6456179778118027E+00, -9.8749254058319202E+00, 4.5424411616514604E+00, 0.0000000000000000E+00}; + FLT c8[] = {-5.0793946806832954E-02, 7.3273813711856639E-01, -2.0117140544738263E+00, 2.6999257940856816E+00, -2.0117140545416512E+00, 7.3273813711318592E-01, -5.0793935653327994E-02, 0.0000000000000000E+00}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); + } else if constexpr(w==8) { + FLT c0[] = {1.1895823653767145E+03, 9.0980236725236929E+04, 7.7438826909537485E+05, 2.0077596413122697E+06, 2.0077596413122697E+06, 7.7438826909537497E+05, 9.0980236725236929E+04, 1.1895823653767147E+03}; + FLT c1[] = {3.9313191526977798E+03, 1.3318570706800820E+05, 5.7275848637687636E+05, 4.6250273225257988E+05, -4.6250273225257976E+05, -5.7275848637687659E+05, -1.3318570706800820E+05, -3.9313191526977798E+03}; + FLT c2[] = {5.2976026193612370E+03, 7.5628970871188430E+04, 1.0073339198368321E+05, -1.8165150843791291E+05, -1.8165150843791291E+05, 1.0073339198368321E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; + FLT c3[] = {3.7552239608473842E+03, 1.8376340228970901E+04, -2.3878081117551585E+04, -4.6296734056047833E+04, 4.6296734056048226E+04, 2.3878081117551632E+04, -1.8376340228970901E+04, -3.7552239608473833E+03}; + FLT c4[] = {1.4742862505418652E+03, 1.2842168112178376E+02, -9.1969665138398723E+03, 7.5990739935234687E+03, 7.5990739935234151E+03, -9.1969665138399178E+03, 1.2842168112178072E+02, 1.4742862505418645E+03}; + FLT c5[] = {2.8158981009344416E+02, -8.8613607108855206E+02, 5.3457145342334378E+01, 2.1750989694614777E+03, -2.1750989694609211E+03, -5.3457145342173561E+01, 8.8613607108856843E+02, -2.8158981009344393E+02}; + FLT c6[] = {-1.4786862436240726E+00, -1.3935442261830281E+02, 3.2599325739083491E+02, -1.9541889343332295E+02, -1.9541889343339443E+02, 3.2599325739083696E+02, -1.3935442261827953E+02, -1.4786862436237442E+00}; + FLT c7[] = {-1.1542034522902307E+01, 1.2000512051397084E+01, 1.9687328710129744E+01, -6.3962883082482271E+01, 6.3962883082874910E+01, -1.9687328710101575E+01, -1.2000512051407391E+01, 1.1542034522902124E+01}; + FLT c8[] = {-1.7448292513542445E+00, 4.8577330433956609E+00, -6.8794163043773890E+00, 3.4611708987408365E+00, 3.4611708985348386E+00, -6.8794163043605385E+00, 4.8577330433771184E+00, -1.7448292513550807E+00}; + FLT c9[] = {1.5044951479021193E-01, 9.6230159355094713E-02, -7.0399250398052082E-01, 1.3251401132916929E+00, -1.3251401128795544E+00, 7.0399250407339709E-01, -9.6230159355094713E-02, -1.5044951479003055E-01}; + for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if constexpr(w==9) { + FLT c0[] = {1.8793738965776997E+03, 2.1220891582018419E+05, 2.5233246441351641E+06, 9.2877384983420596E+06, 1.4015330434461458E+07, 9.2877384983420689E+06, 2.5233246441351632E+06, 2.1220891582018507E+05, 1.8793738965777015E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {6.6675066501609344E+03, 3.4704155240986997E+05, 2.2890184838322559E+06, 3.8705035445351214E+06, -1.6037058324963857E-09, -3.8705035445351251E+06, -2.2890184838322555E+06, -3.4704155240987107E+05, -6.6675066501609363E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092200E+05, -2.1140963571671984E+05, -1.4236515118873848E+06, -2.1140963571672366E+05, 6.8167589492092165E+05, 2.3171563090202425E+05, 9.8412775404612312E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {7.8762358364031033E+03, 7.6500585979636104E+04, 1.2434778984075023E+04, -2.8572091469430045E+05, 1.5952874106327477E-09, 2.8572091469430359E+05, -1.2434778984075045E+04, -7.6500585979636220E+04, -7.8762358364031052E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {3.6941911906762084E+03, 9.9232929169975941E+03, -3.3472877669902169E+04, -1.4082384858052235E+04, 6.7911966136972551E+04, -1.4082384858047793E+04, -3.3472877669902322E+04, 9.9232929169976087E+03, 3.6941911906762070E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {9.8900189723050266E+02, -1.2736589324621855E+03, -5.0407308390126955E+03, 9.8914296140171609E+03, 1.0742991696587890E-09, -9.8914296140222541E+03, 5.0407308390134704E+03, 1.2736589324621880E+03, -9.8900189723050198E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {1.1165868717715853E+02, -5.9057035448564977E+02, 5.5860705835603983E+02, 9.1996097522959656E+02, -2.0290255886377897E+03, 9.1996097523001129E+02, 5.5860705835622480E+02, -5.9057035448564693E+02, 1.1165868717715870E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c7[] = {-1.3142584300868881E+01, -4.2852762793304592E+01, 1.8188640945795066E+02, -2.1362000457567430E+02, 6.1024810759112463E-10, 2.1362000457722939E+02, -1.8188640945795305E+02, 4.2852762793363922E+01, 1.3142584300866494E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c8[] = {-5.8088068374878068E+00, 1.0201832931362965E+01, -3.5220973519213472E-01, -2.6632420896811951E+01, 4.2737607182672249E+01, -2.6632420895534445E+01, -3.5220973562147767E-01, 1.0201832931230712E+01, -5.8088068374901178E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c9[] = {-4.0642645973308456E-01, 1.8389772328416343E+00, -3.5549484953682806E+00, 3.2273562233914270E+00, 1.3413454081272250E-09, -3.2273562258526494E+00, 3.5549484959023196E+00, -1.8389772328242200E+00, 4.0642645973371377E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); + } else if constexpr(w==10) { + FLT c0[] = {2.8923571298063562E+03, 4.6856831608341925E+05, 7.5304732752870023E+06, 3.7576537584215783E+07, 7.9591606307847857E+07, 7.9591606307847857E+07, 3.7576537584215745E+07, 7.5304732752870042E+06, 4.6856831608341780E+05, 2.8923571298063575E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {1.0919387804943191E+04, 8.3976685277206497E+05, 7.9494027659552367E+06, 2.1606786285174552E+07, 1.4625897641453246E+07, -1.4625897641453277E+07, -2.1606786285174549E+07, -7.9494027659552367E+06, -8.3976685277206241E+05, -1.0919387804943171E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {1.7418455635504150E+04, 6.3489952164419880E+05, 3.1358985409389879E+06, 2.2547438801903646E+06, -6.0429762783920728E+06, -6.0429762783920513E+06, 2.2547438801903692E+06, 3.1358985409389860E+06, 6.3489952164419706E+05, 1.7418455635504110E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {1.5396188098732160E+04, 2.5490607173283451E+05, 4.2818880748176615E+05, -9.5435463094349275E+05, -1.2004850139039254E+06, 1.2004850139039545E+06, 9.5435463094349345E+05, -4.2818880748176581E+05, -2.5490607173283395E+05, -1.5396188098732138E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {8.2616700456447434E+03, 5.2880641964112285E+04, -6.1165055141131161E+04, -2.1590299490711108E+05, 2.1595822052157650E+05, 2.1595822052157007E+05, -2.1590299490713840E+05, -6.1165055141131197E+04, 5.2880641964112183E+04, 8.2616700456447306E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {2.7267169079066489E+03, 2.4572549134030801E+03, -2.6065821571078384E+04, 1.3919259807559451E+04, 4.6802084705699206E+04, -4.6802084705714289E+04, -1.3919259807536537E+04, 2.6065821571078890E+04, -2.4572549134029036E+03, -2.7267169079066425E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {5.0402062537834070E+02, -1.3640153425625381E+03, -1.4063198459019245E+03, 7.0858129627834105E+03, -4.8375233777605163E+03, -4.8375233777670810E+03, 7.0858129627894641E+03, -1.4063198459014579E+03, -1.3640153425626913E+03, 5.0402062537833700E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c7[] = {2.4199726682542348E+01, -2.8393731159249540E+02, 5.1652001352543709E+02, 7.4578914842705018E+01, -1.1556759026365337E+03, 1.1556759026651935E+03, -7.4578914839714216E+01, -5.1652001352595710E+02, 2.8393731159268043E+02, -2.4199726682540959E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c8[] = {-1.0545675122360885E+01, -3.0306758891224317E+00, 7.2305523762173834E+01, -1.3808908570221064E+02, 7.6293213403386517E+01, 7.6293213419205742E+01, -1.3808908572505672E+02, 7.2305523760424833E+01, -3.0306758894244412E+00, -1.0545675122369961E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c9[] = {-2.1836930570474395E+00, 5.4992367509081630E+00, -4.5624617253163446E+00, -6.6492709819863256E+00, 2.0339240341691568E+01, -2.0339240351164950E+01, 6.6492710020476089E+00, 4.5624617253163446E+00, -5.4992367508501152E+00, 2.1836930570530630E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c10[] = {-9.1748741459757727E-02, 5.2562451739588611E-01, -1.4144257958835973E+00, 1.8629578990262812E+00, -9.0169874554123419E-01, -9.0169876258108816E-01, 1.8629579026113960E+00, -1.4144257947447987E+00, 5.2562451738534777E-01, -9.1748741464373396E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); + } else if constexpr(w==11) { + FLT c0[] = {4.3537972057094357E+03, 9.8872306817881018E+05, 2.0938056062983289E+07, 1.3701428307175827E+08, 3.8828289972017348E+08, 5.4292197128519189E+08, 3.8828289972017324E+08, 1.3701428307175821E+08, 2.0938056062983286E+07, 9.8872306817881099E+05, 4.3537972057093830E+03, 0.0000000000000000E+00}; + FLT c1[] = {1.7371472778611496E+04, 1.9155790709433770E+06, 2.4914432724618733E+07, 9.7792160665338323E+07, 1.3126779387874992E+08, 1.1003518489948497E-08, -1.3126779387874992E+08, -9.7792160665338367E+07, -2.4914432724618725E+07, -1.9155790709433774E+06, -1.7371472778611387E+04, 0.0000000000000000E+00}; + FLT c2[] = {2.9650558537745437E+04, 1.6014973065836846E+06, 1.1867448782239100E+07, 2.0812212822540633E+07, -1.1749875870571069E+07, -4.5121922350041404E+07, -1.1749875870571032E+07, 2.0812212822540659E+07, 1.1867448782239093E+07, 1.6014973065836851E+06, 2.9650558537745299E+04, 0.0000000000000000E+00}; + FLT c3[] = {2.8505604980264394E+04, 7.4166660874053277E+05, 2.5711466441825330E+06, -1.2146931938153899E+06, -8.3931576510116160E+06, -1.5221113764487218E-08, 8.3931576510117017E+06, 1.2146931938154220E+06, -2.5711466441825316E+06, -7.4166660874053324E+05, -2.8505604980264285E+04, 0.0000000000000000E+00}; + FLT c4[] = {1.7045632829988481E+04, 1.9785834209758078E+05, 8.6361403553701501E+04, -1.0584472412326147E+06, -1.3367486018960556E+05, 1.7818009619467217E+06, -1.3367486018952832E+05, -1.0584472412326441E+06, 8.6361403553699885E+04, 1.9785834209758087E+05, 1.7045632829988419E+04, 0.0000000000000000E+00}; + FLT c5[] = {6.5462464716912918E+03, 2.5347576368078855E+04, -7.5810878908805942E+04, -8.0774039751690128E+04, 2.5492801112955116E+05, 3.6655592491345995E-08, -2.5492801112950110E+05, 8.0774039751702396E+04, 7.5810878908810162E+04, -2.5347576368078677E+04, -6.5462464716912700E+03, 0.0000000000000000E+00}; + FLT c6[] = {1.5684149291082115E+03, -1.0302687059852267E+03, -1.3446845770824435E+04, 2.0814393480320545E+04, 1.4366994276523908E+04, -4.4581342385955380E+04, 1.4366994276463982E+04, 2.0814393480325110E+04, -1.3446845770824308E+04, -1.0302687059850016E+03, 1.5684149291082128E+03, 0.0000000000000000E+00}; + FLT c7[] = {1.9398419323286222E+02, -8.7329293867281388E+02, 2.4796533428938184E+02, 3.2905701326623416E+03, -4.8989871768459579E+03, 2.8861239463615327E-09, 4.8989871768722078E+03, -3.2905701326312101E+03, -2.4796533429068171E+02, 8.7329293867237629E+02, -1.9398419323287882E+02, 0.0000000000000000E+00}; + FLT c8[] = {-4.2288232505124679E+00, -9.9574929618003850E+01, 2.9563077146126534E+02, -1.9453049352240328E+02, -4.0107401572039475E+02, 7.9532514195009401E+02, -4.0107401576942334E+02, -1.9453049354949908E+02, 2.9563077145563869E+02, -9.9574929618160851E+01, -4.2288232505049734E+00, 0.0000000000000000E+00}; + FLT c9[] = {-5.3741131162167548E+00, 5.5350606003782072E+00, 1.9153744596147156E+01, -6.3189447483342484E+01, 6.6921287710344444E+01, 2.6543499136172006E-08, -6.6921287588490713E+01, 6.3189447458080132E+01, -1.9153744593546620E+01, -5.5350606004478644E+00, 5.3741131162113120E+00, 0.0000000000000000E+00}; + FLT c10[] = {-7.0359426508237854E-01, 2.2229112757468452E+00, -3.2054079720618520E+00, 8.3392526913327172E-02, 6.8879260281453520E+00, -1.0795498333352139E+01, 6.8879260220718077E+00, 8.3392507342704467E-02, -3.2054079702060019E+00, 2.2229112757257625E+00, -7.0359426507941902E-01, 0.0000000000000000E+00}; + FLT c11[] = {5.2648094861126392E-02, 9.9912561389764148E-02, -4.3913938527232693E-01, 7.9792987484770361E-01, -6.9191816827427566E-01, -1.2022534526020762E-09, 6.9191820562024531E-01, -7.9792984883890594E-01, 4.3913938443394634E-01, -9.9912561446925147E-02, -5.2648094869462925E-02, 0.0000000000000000E+00}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); + } else if constexpr(w==12) { + FLT c0[] = {6.4299692685485315E+03, 2.0077596413122714E+06, 5.4904521978991628E+07, 4.5946106674819350E+08, 1.6835469840840104E+09, 3.1308386544851556E+09, 3.1308386544851556E+09, 1.6835469840840099E+09, 4.5946106674819458E+08, 5.4904521978991754E+07, 2.0077596413122730E+06, 6.4299692685634491E+03}; + FLT c1[] = {2.6965848540274073E+04, 4.1625245902732178E+06, 7.2097002594596952E+07, 3.8505085985474640E+08, 7.9479013671674240E+08, 4.7870231281824082E+08, -4.7870231281824046E+08, -7.9479013671674252E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732178E+06, -2.6965848540258085E+04}; + FLT c2[] = {4.8869694409905111E+04, 3.7863371066322513E+06, 3.9530526716552719E+07, 1.1475134266581042E+08, 4.6311261797930710E+07, -2.0442837194260675E+08, -2.0442837194260725E+08, 4.6311261797930680E+07, 1.1475134266581020E+08, 3.9530526716552787E+07, 3.7863371066322504E+06, 4.8869694409920470E+04}; + FLT c3[] = {5.0530564260114021E+04, 1.9615784087727289E+06, 1.1044597342441007E+07, 7.9812418612436540E+06, -3.4042228324588493E+07, -3.3301805987927791E+07, 3.3301805987928167E+07, 3.4042228324588671E+07, -7.9812418612435497E+06, -1.1044597342440993E+07, -1.9615784087727286E+06, -5.0530564260099913E+04}; + FLT c4[] = {3.3081876469965493E+04, 6.2011956881368335E+05, 1.3086001239863748E+06, -3.1165484297367339E+06, -5.1982996003442882E+06, 6.3530947749618590E+06, 6.3530947749616513E+06, -5.1982996003444213E+06, -3.1165484297366543E+06, 1.3086001239863599E+06, 6.2011956881368288E+05, 3.3081876469981333E+04}; + FLT c5[] = {1.4308966168506788E+04, 1.1375573205951916E+05, -1.0318195403424598E+05, -6.6892418721462542E+05, 5.9223570255461533E+05, 1.1093685152673351E+06, -1.1093685152666988E+06, -5.9223570255418238E+05, 6.6892418721489178E+05, 1.0318195403424004E+05, -1.1375573205951886E+05, -1.4308966168492358E+04}; + FLT c6[] = {4.0848961919700960E+03, 7.5033277163528910E+03, -5.2578904182711594E+04, 6.3431596329919275E+03, 1.5984798504282799E+05, -1.2521363434070408E+05, -1.2521363434057294E+05, 1.5984798504289921E+05, 6.3431596327853522E+03, -5.2578904182714803E+04, 7.5033277163530738E+03, 4.0848961919843541E+03}; + FLT c7[] = {7.1658797373677544E+02, -1.5499947984100402E+03, -4.5490740453241297E+03, 1.4520122796414065E+04, -3.7896465826366048E+03, -2.3597107892645658E+04, 2.3597107892708405E+04, 3.7896465828577311E+03, -1.4520122796272850E+04, 4.5490740453326107E+03, 1.5499947984094520E+03, -7.1658797372277388E+02}; + FLT c8[] = {5.2022749592533359E+01, -4.0624258132650436E+02, 5.2256582980122801E+02, 9.3282469962834807E+02, -2.8710622267611107E+03, 1.7594166903207245E+03, 1.7594166904840572E+03, -2.8710622269566602E+03, 9.3282469973848731E+02, 5.2256582976889342E+02, -4.0624258132718376E+02, 5.2022749606062760E+01}; + FLT c9[] = {-7.0341875498860729E+00, -2.3043166229077922E+01, 1.2279331781679724E+02, -1.6714687548507158E+02, -4.4746498424591195E+01, 3.6060906024962412E+02, -3.6060905985137049E+02, 4.4746498852565225E+01, 1.6714687549695972E+02, -1.2279331779599295E+02, 2.3043166228938606E+01, 7.0341875614861786E+00}; + FLT c10[] = {-2.1556100132617875E+00, 4.1361104009993737E+00, 1.8107701723532290E+00, -2.1223400322208619E+01, 3.5820961861882218E+01, -1.8782945665578143E+01, -1.8782945409136026E+01, 3.5820961915195049E+01, -2.1223400242576908E+01, 1.8107701298380314E+00, 4.1361104007462801E+00, -2.1556100021452793E+00}; + FLT c11[] = {-1.1440899376747954E-01, 7.0567641591060326E-01, -1.4530217904770133E+00, 1.0571984613482723E+00, 1.4389002957406878E+00, -4.2241732762744180E+00, 4.2241733421252539E+00, -1.4389000664821670E+00, -1.0571984509828731E+00, 1.4530218285851431E+00, -7.0567641613924970E-01, 1.1440900438178304E-01}; + FLT c12[] = {-1.4486009663463860E-02, 2.9387825785034223E-03, -1.0265969715607470E-01, 2.6748267835596640E-01, -3.3606430399849180E-01, 1.5850148085005597E-01, 1.5850183161365292E-01, -3.3606448814949358E-01, 2.6748281866164947E-01, -1.0265975004478733E-01, 2.9387817050372631E-03, -1.4486000369842612E-02}; + for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); + } else if constexpr(w==13) { + FLT c0[] = {9.3397060605267689E+03, 3.9447202186643109E+06, 1.3701428307175812E+08, 1.4375660883001409E+09, 6.6384519128895693E+09, 1.5848048271166529E+10, 2.1031560281976665E+10, 1.5848048271166502E+10, 6.6384519128895674E+09, 1.4375660883001378E+09, 1.3701428307175812E+08, 3.9447202186642843E+06, 9.3397060605268125E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {4.0984512931817764E+04, 8.6828943763566799E+06, 1.9558432133067656E+08, 1.3674961320373521E+09, 3.9251291128182430E+09, 4.5116631434426517E+09, 4.8375356630808043E-07, -4.5116631434426460E+09, -3.9251291128182402E+09, -1.3674961320373492E+09, -1.9558432133067656E+08, -8.6828943763566278E+06, -4.0984512931817771E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {7.8379538318778985E+04, 8.4928073133582603E+06, 1.1992091153966437E+08, 5.0561697705436689E+08, 6.1845897311593950E+08, -5.1306326495404470E+08, -1.4790096327029374E+09, -5.1306326495404077E+08, 6.1845897311593986E+08, 5.0561697705436659E+08, 1.1992091153966436E+08, 8.4928073133582156E+06, 7.8379538318778927E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {8.6417670227040013E+04, 4.8250267333349697E+06, 3.9836803808039002E+07, 7.5026052902191013E+07, -7.7565422849560052E+07, -2.5393835488011825E+08, 5.1202971235247489E-07, 2.5393835488012013E+08, 7.7565422849558711E+07, -7.5026052902191967E+07, -3.9836803808039002E+07, -4.8250267333349511E+06, -8.6417670227039998E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {6.1161604972829380E+04, 1.7331203720075535E+06, 7.0216196997558968E+06, -3.6027138646117523E+06, -3.1775875626364492E+07, 1.6544480876790185E+06, 4.9816566960114852E+07, 1.6544480876808946E+06, -3.1775875626363728E+07, -3.6027138646113039E+06, 7.0216196997558847E+06, 1.7331203720075490E+06, 6.1161604972829351E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {2.9177164557155938E+04, 3.9318079134661221E+05, 3.1307448297760956E+05, -2.7571366584957433E+06, -9.8421840747392306E+05, 6.8469173866731795E+06, 2.9232946975263515E-06, -6.8469173866698397E+06, 9.8421840747792379E+05, 2.7571366584955421E+06, -3.1307448297758284E+05, -3.9318079134660971E+05, -2.9177164557155946E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {9.5097815505886610E+03, 4.8799940773716655E+04, -1.2734023162441862E+05, -2.5472337176564379E+05, 6.3596049196278059E+05, 2.2361868201841635E+05, -1.0716559939651759E+06, 2.2361868202218774E+05, 6.3596049196161982E+05, -2.5472337176485342E+05, -1.2734023162441724E+05, 4.8799940773713337E+04, 9.5097815505886447E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c7[] = {2.0601715730545379E+03, 1.9365931141472569E+02, -2.5304303117518622E+04, 2.9151392447034210E+04, 5.9055020355306144E+04, -1.1784846181665688E+05, 1.1400011168699383E-06, 1.1784846181507374E+05, -5.9055020356297522E+04, -2.9151392447480976E+04, 2.5304303117520958E+04, -1.9365931141621550E+02, -2.0601715730545466E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c8[] = {2.5975061893404052E+02, -1.0025387650583972E+03, -6.8642481194759603E+02, 6.7515314205452096E+03, -7.0772939650079616E+03, -6.5444514139847633E+03, 1.6566898963381227E+04, -6.5444514164662887E+03, -7.0772939638053231E+03, 6.7515314202341915E+03, -6.8642481198706810E+02, -1.0025387650556635E+03, 2.5975061893403893E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c9[] = {5.8705282128634133E+00, -1.4424362302822419E+02, 3.3390627215295177E+02, 4.8151337640374301E+01, -1.1431733953039347E+03, 1.4557114789663567E+03, 1.9301282133401762E-06, -1.4557114797747520E+03, 1.1431733969207255E+03, -4.8151337212400264E+01, -3.3390627213809154E+02, 1.4424362302302313E+02, -5.8705282128808269E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c10[] = {-4.0954969508936898E+00, -1.2634947188543673E+00, 3.8134139835466350E+01, -8.4115524781317148E+01, 4.2766848228448069E+01, 1.0573434411021174E+02, -1.9636661067694894E+02, 1.0573435394677749E+02, 4.2766846813968300E+01, -8.4115525213218916E+01, 3.8134139824669184E+01, -1.2634947158177201E+00, -4.0954969509055461E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c11[] = {-6.2702735486285888E-01, 1.8595467772479546E+00, -1.3027978470952948E+00, -4.9265265903267785E+00, 1.3906831953385087E+01, -1.3753762586104637E+01, 1.0604155239584518E-06, 1.3753756761963198E+01, -1.3906831509501583E+01, 4.9265273268806409E+00, 1.3027978586801867E+00, -1.8595467797630916E+00, 6.2702735486047489E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c12[] = {-4.8290636703364975E-02, 1.7531876505199090E-01, -5.0041292774701596E-01, 6.3665145473474949E-01, -1.2476811514471326E-02, -1.2061603189510861E+00, 1.8595308638696268E+00, -1.2061633355215959E+00, -1.2475969680262359E-02, 6.3665088474340670E-01, -5.0041295405456876E-01, 1.7531876799797264E-01, -4.8290636708721864E-02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c13[] = {2.2894665617766322E-02, -7.1358257229878720E-03, -1.4950743217821900E-02, 7.0611745711086651E-02, -1.2311302279978055E-01, 1.0342573392772816E-01, 5.7346192890547669E-07, -1.0342709034448951E-01, 1.2311300937219723E-01, -7.0611830251417942E-02, 1.4950741891648016E-02, 7.1358203725587141E-03, -2.2894665628191136E-02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); + } else if constexpr(w==14) { + FLT c0[] = {1.3368785683552904E+04, 7.5304732752870144E+06, 3.2765764524434990E+08, 4.2418096936485257E+09, 2.4197690538177525E+10, 7.2227640697189651E+10, 1.2261475327356714E+11, 1.2261475327356711E+11, 7.2227640697189682E+10, 2.4197690538177582E+10, 4.2418096936485257E+09, 3.2765764524435169E+08, 7.5304732752870200E+06, 1.3368785683578039E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c1[] = {6.1154444023081669E+04, 1.7488686085101541E+07, 5.0279014009863263E+08, 4.4777867842655849E+09, 1.6916819861812059E+10, 2.8971884004562843E+10, 1.6054555293734524E+10, -1.6054555293734529E+10, -2.8971884004562843E+10, -1.6916819861812090E+10, -4.4777867842655830E+09, -5.0279014009863406E+08, -1.7488686085101560E+07, -6.1154444023056145E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c2[] = {1.2279790808348049E+05, 1.8230319600271538E+07, 3.3815815633683985E+08, 1.9369899011251254E+09, 3.9743454154781203E+09, 7.4954544638351786E+08, -7.0173920607395000E+09, -7.0173920607395000E+09, 7.4954544638351130E+08, 3.9743454154781117E+09, 1.9369899011251252E+09, 3.3815815633684093E+08, 1.8230319600271557E+07, 1.2279790808350699E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c3[] = {1.4339321200624766E+05, 1.1200899688172188E+07, 1.2799140125169712E+08, 4.0176966726270604E+08, 7.9146174555810899E+07, -1.1719748245183561E+09, -9.6919138198233843E+08, 9.6919138198235476E+08, 1.1719748245183618E+09, -7.9146174555819452E+07, -4.0176966726270568E+08, -1.2799140125169776E+08, -1.1200899688172201E+07, -1.4339321200622554E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c4[] = {1.0866548538632700E+05, 4.4565213401510641E+06, 2.8354150929531462E+07, 2.2805067924009934E+07, -1.2058223609889300E+08, -1.2775415620368913E+08, 1.9261201640091014E+08, 1.9261201640090343E+08, -1.2775415620368628E+08, -1.2058223609888241E+08, 2.2805067924009915E+07, 2.8354150929531943E+07, 4.4565213401510660E+06, 1.0866548538635390E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c5[] = {5.6346565047794407E+04, 1.1743908345502375E+06, 3.0601086667309003E+06, -7.2274020134796975E+06, -1.6220595157143334E+07, 2.0773587344466623E+07, 2.8183198298701070E+07, -2.8183198298682313E+07, -2.0773587344454899E+07, 1.6220595157147046E+07, 7.2274020134809064E+06, -3.0601086667310768E+06, -1.1743908345502312E+06, -5.6346565047771022E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c6[] = {2.0435142564639598E+04, 1.9450977300078847E+05, -1.1234667576926883E+05, -1.5205767549240857E+06, 1.0515640561047094E+06, 3.7458351782500809E+06, -3.3794074240119159E+06, -3.3794074240111569E+06, 3.7458351782506104E+06, 1.0515640561079446E+06, -1.5205767549239916E+06, -1.1234667576914738E+05, 1.9450977300078212E+05, 2.0435142564663307E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c7[] = {5.1491366053560478E+03, 1.4735748500440239E+04, -8.1689482343683034E+04, -3.5176894225644079E+04, 3.7034248410400847E+05, -1.9109669530460562E+05, -5.2637978465735121E+05, 5.2637978465564619E+05, 1.9109669530912716E+05, -3.7034248412078863E+05, 3.5176894225852200E+04, 8.1689482343699274E+04, -1.4735748500439855E+04, -5.1491366053330485E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c8[] = {8.5138795113645585E+02, -1.2978618911733427E+03, -8.7500873646623440E+03, 2.1319159613970569E+04, 7.6586611605801199E+03, -6.2424139811455236E+04, 4.2620771487921840E+04, 4.2620771491440872E+04, -6.2424139815176597E+04, 7.6586611693937375E+03, 2.1319159613447209E+04, -8.7500873648877496E+03, -1.2978618911701635E+03, 8.5138795115875257E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c9[] = {7.2176142041616245E+01, -4.5543406155008586E+02, 2.8301959891624585E+02, 2.1994171513769957E+03, -4.5082500677203352E+03, 4.7658016853354945E+02, 7.1044827209848581E+03, -7.1044827023442112E+03, -4.7658015978385805E+02, 4.5082500694322307E+03, -2.1994171506161529E+03, -2.8301959873197922E+02, 4.5543406154525627E+02, -7.2176142022451799E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c10[] = {-3.1135380163286266E+00, -3.8554406982628045E+01, 1.4396028111579378E+02, -1.1260050352192819E+02, -3.0073665460436297E+02, 7.2079162225452933E+02, -4.1195308319958349E+02, -4.1195308907344031E+02, 7.2079162228692246E+02, -3.0073665296314113E+02, -1.1260050391063737E+02, 1.4396028095922969E+02, -3.8554406981953719E+01, -3.1135379980309104E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c11[] = {-1.6022934776950781E+00, 1.8678197421257499E+00, 8.3368944138930576E+00, -3.0791578217513287E+01, 3.4749712345962102E+01, 1.2322522680262193E+01, -7.3924006859338746E+01, 7.3924005395986399E+01, -1.2322518095091780E+01, -3.4749717239655702E+01, 3.0791578812609753E+01, -8.3368942651188451E+00, -1.8678197375527952E+00, 1.6022934952009980E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c12[] = {-1.9362061840948824E-01, 6.3024467669748396E-01, -9.3262278519229969E-01, -4.8908749318740480E-01, 4.0479376609320967E+00, -6.2829712900962678E+00, 3.1767825933699174E+00, 3.1767865219197975E+00, -6.2829777441520323E+00, 4.0479394849078085E+00, -4.8908801933495105E-01, -9.3262306580362497E-01, 6.3024467258732675E-01, -1.9362060312142931E-01, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c13[] = {1.8785913718903639E-02, 3.1605271252714680E-02, -1.3655798291459853E-01, 2.5016547139148904E-01, -1.6654308552073466E-01, -2.1682598043284024E-01, 6.1786085249849709E-01, -6.1785470804340159E-01, 2.1682794765059335E-01, 1.6654258378326353E-01, -2.5016523395036322E-01, 1.3655803190024704E-01, -3.1605272440421092E-02, -1.8785905282938619E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + FLT c14[] = {-1.2896545140952162E-02, -3.7106972352948116E-03, 5.8857860695711909E-04, 1.3987176343065890E-02, -3.5714007561179102E-02, 4.3401590960273219E-02, -2.0034532372716081E-02, -2.0038454375630149E-02, 4.3401322628411031E-02, -3.5713348533616053E-02, 1.3987046090052241E-02, 5.8856319054218355E-04, -3.7106979912720915E-03, -1.2896537385752806E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); + } else if constexpr(w==15) { + FLT c0[] = {1.8887777774374499E+04, 1.4015330434461417E+07, 7.5498683300180018E+08, 1.1900937739619951E+10, 8.2530965279375351E+10, 3.0178246269069604E+11, 6.3775691457119104E+11, 8.1471473119305554E+11, 6.3775691457119116E+11, 3.0178246269069641E+11, 8.2530965279375519E+10, 1.1900937739619963E+10, 7.5498683300180054E+08, 1.4015330434461435E+07, 1.8887777774374488E+04, 0.0000000000000000E+00}; + FLT c1[] = {8.9780907163796335E+04, 3.4167636285297148E+07, 1.2346880033823481E+09, 1.3719272724135921E+10, 6.5858241494816696E+10, 1.5266999939989539E+11, 1.5687794513790723E+11, -2.8523584844088883E-05, -1.5687794513790732E+11, -1.5266999939989545E+11, -6.5858241494816811E+10, -1.3719272724135933E+10, -1.2346880033823476E+09, -3.4167636285297163E+07, -8.9780907163796335E+04, 0.0000000000000000E+00}; + FLT c2[] = {1.8850321233130712E+05, 3.7693640983013541E+07, 8.9846818051570034E+08, 6.7094088040439653E+09, 1.9743296615199215E+10, 1.8072727219391140E+10, -2.0634615374559410E+10, -4.9654335197177498E+10, -2.0634615374559414E+10, 1.8072727219391048E+10, 1.9743296615199223E+10, 6.7094088040439672E+09, 8.9846818051570022E+08, 3.7693640983013526E+07, 1.8850321233130703E+05, 0.0000000000000000E+00}; + FLT c3[] = {2.3185006533495727E+05, 2.4789475362741601E+07, 3.7751696829092383E+08, 1.7167916788178182E+09, 1.9832401267745295E+09, -3.4881359830884194E+09, -7.8785602379628601E+09, 6.6906528952995499E-05, 7.8785602379629536E+09, 3.4881359830884261E+09, -1.9832401267745163E+09, -1.7167916788178096E+09, -3.7751696829092425E+08, -2.4789475362741597E+07, -2.3185006533495730E+05, 0.0000000000000000E+00}; + FLT c4[] = {1.8672970114818285E+05, 1.0741068109706732E+07, 9.8017949708492473E+07, 2.0291084954252145E+08, -2.7857869294214898E+08, -9.4112677968756318E+08, 1.7886520649334356E+08, 1.4579673547891481E+09, 1.7886520649344125E+08, -9.4112677968753338E+08, -2.7857869294217581E+08, 2.0291084954251301E+08, 9.8017949708492488E+07, 1.0741068109706739E+07, 1.8672970114818282E+05, 0.0000000000000000E+00}; + FLT c5[] = {1.0411891611891470E+05, 3.1771463075269456E+06, 1.4880104152842037E+07, -6.8136965447538150E+06, -8.7072998215422541E+07, 1.8024116530863210E+06, 1.9067730799615666E+08, 1.2078175959365315E-04, -1.9067730799603686E+08, -1.8024116529155241E+06, 8.7072998215445980E+07, 6.8136965447565373E+06, -1.4880104152841812E+07, -3.1771463075269484E+06, -1.0411891611891470E+05, 0.0000000000000000E+00}; + FLT c6[] = {4.1300641422694731E+04, 6.3217168592497683E+05, 7.7343707634845132E+05, -5.4575962381476769E+06, -3.7387211063063843E+06, 1.8451583614082869E+07, 3.0480804948189310E+06, -2.7500445095872246E+07, 3.0480804948457484E+06, 1.8451583614064269E+07, -3.7387211062890980E+06, -5.4575962381450543E+06, 7.7343707634841127E+05, 6.3217168592497602E+05, 4.1300641422694724E+04, 0.0000000000000000E+00}; + FLT c7[] = {1.1710443348523711E+04, 7.5405449195716908E+04, -1.6634736996487752E+05, -5.6069290801842115E+05, 1.1540571563940533E+06, 1.0209821660925965E+06, -2.9641921942009293E+06, -7.3770236318814628E-06, 2.9641921942630685E+06, -1.0209821662946860E+06, -1.1540571563987043E+06, 5.6069290801928868E+05, 1.6634736996459437E+05, -7.5405449195719295E+04, -1.1710443348523739E+04, 0.0000000000000000E+00}; + FLT c8[] = {2.3142324239350210E+03, 2.1710560541703007E+03, -3.6929625713151705E+04, 2.6143898219588682E+04, 1.4046980090353978E+05, -2.1033190114896413E+05, -1.1132269819276403E+05, 3.7491447373940505E+05, -1.1132269820720138E+05, -2.1033190120894444E+05, 1.4046980085134835E+05, 2.6143898217223435E+04, -3.6929625713258414E+04, 2.1710560541651053E+03, 2.3142324239349791E+03, 0.0000000000000000E+00}; + FLT c9[] = {2.8879718294281940E+02, -9.2801372612866078E+02, -1.9817144428357562E+03, 9.9004179214302640E+03, -5.7928268996319048E+03, -2.1083466266548403E+04, 3.3285502001854453E+04, 1.3615676123196788E-04, -3.3285501884684672E+04, 2.1083466388283239E+04, 5.7928269528908959E+03, -9.9004179214302640E+03, 1.9817144428357562E+03, 9.2801372612624596E+02, -2.8879718294281940E+02, 0.0000000000000000E+00}; + FLT c10[] = {1.3121871131759899E+01, -1.5978845118014243E+02, 2.7429718889479011E+02, 4.4598059431432415E+02, -1.8917609556521720E+03, 1.5303002256342920E+03, 1.7542368404254241E+03, -3.9411530187890685E+03, 1.7542368839611659E+03, 1.5303002335812619E+03, -1.8917609760379448E+03, 4.4598059250034765E+02, 2.7429718872202716E+02, -1.5978845118149314E+02, 1.3121871131760223E+01, 0.0000000000000000E+00}; + FLT c11[] = {-2.4286151057622600E+00, -6.7839829150137421E+00, 4.6999223003107119E+01, -7.4896070454665107E+01, -3.2010110856873055E+01, 2.5022929107925501E+02, -2.8786053481345135E+02, 1.4424367379967129E-05, 2.8786057555317575E+02, -2.5022937123192844E+02, 3.2010139421505684E+01, 7.4896073537460509E+01, -4.6999223012862650E+01, 6.7839829186720362E+00, 2.4286151057336860E+00, 0.0000000000000000E+00}; + FLT c12[] = {-5.4810555665671257E-01, 1.1436870859674571E+00, 8.2471504792547190E-01, -8.5602131787584241E+00, 1.5631631237511966E+01, -6.4979395997142886E+00, -1.8737629118679905E+01, 3.3283673647767003E+01, -1.8737705444926284E+01, -6.4980552114725620E+00, 1.5631576798962341E+01, -8.5602158445716778E+00, 8.2471481116140977E-01, 1.1436870769250529E+00, -5.4810555667406624E-01, 0.0000000000000000E+00}; + FLT c13[] = {-1.4554612891837512E-02, 1.7022157398269799E-01, -3.7563892964814216E-01, 2.0131145240492249E-01, 8.3554123561642435E-01, -2.1191317631421946E+00, 1.9961007770939201E+00, 5.0230495487029605E-05, -1.9960655197919825E+00, 2.1191435815870405E+00, -8.3552330614378623E-01, -2.0131363341395125E-01, 3.7563890238546094E-01, -1.7022157734534860E-01, 1.4554612875194470E-02, 0.0000000000000000E+00}; + FLT c14[] = {-1.2348455978815665E-02, 2.6143485494326945E-03, -2.9252290291144727E-02, 7.5392101552106419E-02, -8.7986538697867239E-02, 1.3073120666751545E-03, 1.5251801232957554E-01, -2.3235618419546245E-01, 1.5253703942622115E-01, 1.3217162898956957E-03, -8.7999818995735196E-02, 7.5391507930594778E-02, -2.9252395603998178E-02, 2.6143483927929994E-03, -1.2348455970768767E-02, 0.0000000000000000E+00}; + FLT c15[] = {1.4214685591273772E-02, -1.2364346992375923E-03, 1.2892328724708124E-03, 1.6178725688327468E-03, -8.2104229475896996E-03, 1.3914679473447157E-02, -1.1426959041713501E-02, 1.6590583007947697E-05, 1.1446333966460217E-02, -1.3912124902889801E-02, 8.2298310485774198E-03, -1.6155336438419190E-03, -1.2892162843503102E-03, 1.2364372911314208E-03, -1.4214685607473108E-02, 0.0000000000000000E+00}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); + } else if constexpr(w==16) { + FLT c0[] = {2.6374086784014689E+04, 2.5501413681212645E+07, 1.6835469840840099E+09, 3.1953580806547867E+10, 2.6584910126662766E+11, 1.1715858191494619E+12, 3.0181658330343120E+12, 4.7888775408612773E+12, 4.7888775408612764E+12, 3.0181658330343125E+12, 1.1715858191494619E+12, 2.6584910126662772E+11, 3.1953580806547874E+10, 1.6835469840840104E+09, 2.5501413681212656E+07, 2.6374086784014886E+04}; + FLT c1[] = {1.2991568388123445E+05, 6.4986154651133664E+07, 2.9142305012947259E+09, 3.9748054433728149E+10, 2.3649443248440247E+11, 7.0471088240421252E+11, 1.0533888905987031E+12, 5.4832304482297632E+11, -5.4832304482297687E+11, -1.0533888905987034E+12, -7.0471088240421265E+11, -2.3649443248440250E+11, -3.9748054433728149E+10, -2.9142305012947259E+09, -6.4986154651133649E+07, -1.2991568388123448E+05}; + FLT c2[] = {2.8421223836872831E+05, 7.5448503558118582E+07, 2.2710828032883868E+09, 2.1491603403163826E+10, 8.4299374042308136E+10, 1.3384457365769528E+11, 1.8630012765531485E+09, -2.4384536789321179E+11, -2.4384536789321094E+11, 1.8630012765532806E+09, 1.3384457365769531E+11, 8.4299374042308090E+10, 2.1491603403163826E+10, 2.2710828032883863E+09, 7.5448503558118552E+07, 2.8421223836872820E+05}; + FLT c3[] = {3.6653021243297518E+05, 5.2693428548387080E+07, 1.0410094433021281E+09, 6.3986267576853533E+09, 1.3313926739756302E+10, -2.7909761561128025E+09, -3.9911638977027977E+10, -2.9236947704012939E+10, 2.9236947704012939E+10, 3.9911638977028267E+10, 2.7909761561128430E+09, -1.3313926739756279E+10, -6.3986267576853561E+09, -1.0410094433021276E+09, -5.2693428548387088E+07, -3.6653021243297518E+05}; + FLT c4[] = {3.1185660915838118E+05, 2.4564274645530280E+07, 3.0509279143241835E+08, 1.0432225146182569E+09, 6.4966284440222360E+07, -4.2483903608016477E+09, -3.1778261722524829E+09, 5.9880587942832708E+09, 5.9880587942832832E+09, -3.1778261722526174E+09, -4.2483903608017979E+09, 6.4966284440235756E+07, 1.0432225146182607E+09, 3.0509279143241805E+08, 2.4564274645530272E+07, 3.1185660915838124E+05}; + FLT c5[] = {1.8544733523229562E+05, 7.9824949938292839E+06, 5.6880943382648192E+07, 5.4097201999258779E+07, -3.0776449202833223E+08, -3.7659931821867347E+08, 6.8797698944719648E+08, 7.5429896889866996E+08, -7.5429896889781320E+08, -6.8797698944658160E+08, 3.7659931821898031E+08, 3.0776449202837497E+08, -5.4097201999252096E+07, -5.6880943382647842E+07, -7.9824949938292857E+06, -1.8544733523229562E+05}; + FLT c6[] = {7.9472339236673259E+04, 1.8159676553648398E+06, 5.7259818806751696E+06, -1.2786136236423338E+07, -3.8677490873147681E+07, 4.7651450515707508E+07, 9.0723760109202415E+07, -9.4532949239946112E+07, -9.4532949239604995E+07, 9.0723760109522834E+07, 4.7651450515667401E+07, -3.8677490873160362E+07, -1.2786136236416934E+07, 5.7259818806752721E+06, 1.8159676553648538E+06, 7.9472339236673215E+04}; + FLT c7[] = {2.4831718998299857E+04, 2.7536301841716090E+05, -5.1045953356025166E+04, -2.6996387880239477E+06, 1.1656554632125401E+06, 9.1521923449522462E+06, -6.8198180925621921E+06, -1.2555197000954127E+07, 1.2555197001087580E+07, 6.8198180925775450E+06, -9.1521923449367471E+06, -1.1656554632051867E+06, 2.6996387880183556E+06, 5.1045953355832869E+04, -2.7536301841717580E+05, -2.4831718998299897E+04}; + FLT c8[] = {5.6060763597396035E+03, 2.2154740880101843E+04, -1.0243462874810334E+05, -1.1802198892388590E+05, 6.4061699367506150E+05, -1.1166716749369531E+05, -1.4153578101923370E+06, 1.0790712965214122E+06, 1.0790712965802078E+06, -1.4153578102569627E+06, -1.1166716767280686E+05, 6.4061699367841065E+05, -1.1802198892652121E+05, -1.0243462874831920E+05, 2.2154740880096295E+04, 5.6060763597396262E+03}; + FLT c9[] = {8.7271993222049730E+02, -7.0074676859193858E+02, -1.2528372958474913E+04, 2.3643101054370443E+04, 3.1699060146436736E+04, -1.1270133578294520E+05, 3.6872846840416030E+04, 1.5168911768972370E+05, -1.5168911672801850E+05, -3.6872846329129716E+04, 1.1270133600206790E+05, -3.1699060140349993E+04, -2.3643101053229180E+04, 1.2528372958403583E+04, 7.0074676858840917E+02, -8.7271993222049730E+02}; + FLT c10[] = {7.8842259458727298E+01, -4.2070880913717718E+02, -1.0535142166729695E+02, 3.3375056757602101E+03, -4.9426353709826744E+03, -3.6567309465694352E+03, 1.5199085032737788E+04, -9.4972226150681072E+03, -9.4972224492176338E+03, 1.5199085307902486E+04, -3.6567309714471071E+03, -4.9426353751288962E+03, 3.3375056795609726E+03, -1.0535142205602271E+02, -4.2070880913447866E+02, 7.8842259458701932E+01}; + FLT c11[] = {8.9833076760252317E-02, -4.4163371177310189E+01, 1.2880771175011134E+02, 2.8722208980881483E+00, -5.7164632401064989E+02, 9.0417621054583299E+02, 1.1221311957018894E+00, -1.4190922684153286E+03, 1.4190926436578332E+03, -1.1219382673482139E+00, -9.0417616902565715E+02, 5.7164633587355513E+02, -2.8722219907225899E+00, -1.2880771149646372E+02, 4.4163371174871045E+01, -8.9833076793553943E-02}; + FLT c12[] = {-1.0900468357304585E+00, -1.1264666580175993E-01, 1.1810668498718398E+01, -3.0289105594116332E+01, 1.5494599855921946E+01, 6.0130016326899806E+01, -1.2330195579557967E+02, 6.7114292010484860E+01, 6.7114238133033894E+01, -1.2330200967294053E+02, 6.0129899592769000E+01, 1.5494588631452897E+01, -3.0289108821162568E+01, 1.1810668060273379E+01, -1.1264668224327026E-01, -1.0900468357482698E+00}; + FLT c13[] = {-1.1763610124684608E-01, 4.2939195551308978E-01, -2.7950231695310290E-01, -1.7354597875532083E+00, 5.1181749794184972E+00, -5.0538409872852545E+00, -2.1268758321444312E+00, 1.0709572497394593E+01, -1.0709247944735344E+01, 2.1270284132327628E+00, 5.0538814533614023E+00, -5.1181783143082038E+00, 1.7354587260576941E+00, 2.7950208340719496E-01, -4.2939195720020440E-01, 1.1763610121354666E-01}; + FLT c14[] = {-1.8020499708490779E-02, 3.6694576081450124E-02, -1.1331174689418615E-01, 1.3970801507325420E-01, 8.1708800731612838E-02, -5.4465632012605969E-01, 7.9628723318194716E-01, -3.9045387765910361E-01, -3.9034731591396871E-01, 7.9641679205120786E-01, -5.4465236519348836E-01, 8.1709687544577886E-02, 1.3970913694934384E-01, -1.1331198385459386E-01, 3.6694575058947500E-02, -1.8020499699434717E-02}; + FLT c15[] = {1.4589783457723899E-02, -7.8885273589694921E-04, -4.4854775481901451E-03, 1.8117810622567232E-02, -3.0563678378015532E-02, 1.9027105036022670E-02, 2.4778670881552757E-02, -6.7767913155521747E-02, 6.7979444868167399E-02, -2.4638534439549119E-02, -1.8992900331546877E-02, 3.0569915511324409E-02, -1.8117279802711158E-02, 4.4857097818771776E-03, 7.8885377265448060E-04, -1.4589783469873403E-02}; + FLT c16[] = {-1.0467998068898355E-02, -3.2140568385029999E-04, 5.2979866592800886E-04, -1.5800624712947203E-04, -1.4200041949817279E-03, 3.7626007108648857E-03, -3.8348321381240775E-03, 1.6547563335740942E-03, 1.5759584129276946E-03, -3.8873640852216617E-03, 3.7166352571544989E-03, -1.4265706883689335E-03, -1.5923746463956793E-04, 5.2952292450647511E-04, -3.2141610431099765E-04, -1.0467998084554094E-02}; + for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); + } else + printf("width not implemented!\n"); diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index edd25adfb..8e55eab4a 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -31,7 +31,13 @@ int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, { FINUFFT_PLAN plan; int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, - popts); // popts (ptr to opts) can be NULL + popts); // popts + // (ptr + // to + // opts) + // can + // be + // NULL if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index e6f8eaba9..86566403e 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -6,6 +6,10 @@ #include #include +#include "ker_horner_allw_loop_constexpr.h" +#include + +#include #include #include #include @@ -14,45 +18,78 @@ using namespace std; using namespace finufft::utils; // access to timer -namespace finufft { -namespace spreadinterp { - +namespace finufft::spreadinterp { + +namespace { // anonymous namespace for internal structs equivalent to declaring everything + // static +struct zip_low; +struct zip_hi; +// forward declaration to clean up the code and be able to use this everywhere in the file +template static constexpr auto BestSIMDHelper(); +template constexpr auto GetPaddedSIMDSize(); +template +using PaddedSIMD = typename xsimd::make_sized_batch()>::type; +template uint16_t get_padding(uint8_t ns); +template constexpr auto get_padding(); +template +using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; +template constexpr uint16_t min_batch_size(); +template constexpr auto find_optimal_batch_size(); +template +constexpr auto initialize_complex_batch(V a, V b) noexcept; +template +constexpr auto zip_low_index = + xsimd::make_batch_constant, arch_t, zip_low>(); +template +constexpr auto zip_hi_index = + xsimd::make_batch_constant, arch_t, zip_hi>(); +} // namespace // declarations of purely internal functions... (thus need not be in .h) +template()>, + typename... V> +static FINUFFT_ALWAYS_INLINE auto ker_eval(const finufft_spread_opts &opts, + const V... elems) noexcept; static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N) noexcept; -static FINUFFT_ALWAYS_INLINE void set_kernel_args(FLT *args, FLT x, - const finufft_spread_opts &opts); +static FINUFFT_ALWAYS_INLINE void set_kernel_args( + FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( - FLT *ker, FLT *args, const finufft_spread_opts &opts, const int N); -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner(FLT *ker, FLT z, const int w, - const finufft_spread_opts &opts); -void interp_line(FLT *out, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns); -void interp_square(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, - BIGINT N1, BIGINT N2, int ns); -void interp_cube(FLT *out, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i1, BIGINT i2, - BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3, int ns); -void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du0, BIGINT M0, FLT *kx0, - FLT *dd0, const finufft_spread_opts &opts); -void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, FLT *du0, - BIGINT M0, FLT *kx0, FLT *ky0, FLT *dd0, - const finufft_spread_opts &opts); -void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, - BIGINT size2, BIGINT size3, FLT *du0, BIGINT M0, FLT *kx0, - FLT *ky0, FLT *kz0, FLT *dd0, const finufft_spread_opts &opts); -void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT size1, - BIGINT size2, BIGINT size3, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, FLT *du0); -void add_wrapped_subgrid_thread_safe(BIGINT offset1, BIGINT offset2, BIGINT offset3, - BIGINT size1, BIGINT size2, BIGINT size3, BIGINT N1, - BIGINT N2, BIGINT N3, FLT *data_uniform, FLT *du0); -void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1, - BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y, - double bin_size_z, int debug); + FLT *ker, FLT *args, const finufft_spread_opts &opts, int N) noexcept; +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( + FLT *ker, FLT x, int w, const finufft_spread_opts &opts) noexcept; +template()>> // aka ns +static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( + FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; +static void interp_line(FLT *out, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns); +static void interp_square(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, + BIGINT N1, BIGINT N2, int ns); +static void interp_cube(FLT *out, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i1, + BIGINT i2, BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3, int ns); +static void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du0, BIGINT M0, FLT *kx0, + FLT *dd0, const finufft_spread_opts &opts) noexcept; +static void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, + FLT *FINUFFT_RESTRICT du, BIGINT M, const FLT *kx, + const FLT *ky, const FLT *dd, + const finufft_spread_opts &opts) noexcept; +static void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, + BIGINT size2, BIGINT size3, FLT *du0, BIGINT M0, + FLT *kx0, FLT *ky0, FLT *kz0, FLT *dd0, + const finufft_spread_opts &opts) noexcept; +template +static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, + BIGINT padded_size1, BIGINT size1, BIGINT size2, + BIGINT size3, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const FLT *du0); +static void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + BIGINT N1, BIGINT N2, BIGINT N3, double bin_size_x, + double bin_size_y, double bin_size_z, int debug); void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGINT N1, BIGINT N2, BIGINT N3, double bin_size_x, double bin_size_y, double bin_size_z, int debug, int nthr); -void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size1, - BIGINT &size2, BIGINT &size3, BIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, - int ns, int ndims); +static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, + BIGINT &padded_size1, BIGINT &size1, BIGINT &size2, BIGINT &size3, + BIGINT M0, FLT *kx0, FLT *ky0, FLT *kz0, int ns, int ndims); // ========================================================================== int spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, @@ -93,13 +130,13 @@ int spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, F Inputs: N1,N2,N3 - grid sizes in x (fastest), y (medium), z (slowest) respectively. If N2==1, 1D spreading is done. If N3==1, 2D spreading. - Otherwise, 3D. + Otherwise, 3D. M - number of NU pts. kx, ky, kz - length-M real arrays of NU point coordinates (only kx read in 1D, only kx and ky read in 2D). - These should lie in the box -pi<=kx<=pi. Points outside this domain are also - correctly folded back into this domain. + These should lie in the box -pi<=kx<=pi. Points outside this domain are also + correctly folded back into this domain. opts - spread/interp options struct, documented in ../include/finufft_spread_opts.h Inputs/Outputs: @@ -308,8 +345,7 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // Splits sorted inds (jfm's advanced2), could double RAM. // choose nb (# subprobs) via used nthreads: int nb = min((BIGINT)nthr, M); // simply split one subprob per thr... - if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap - // size + if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does // ceil(M/opts.max_subproblem_size) if (opts.debug) @@ -346,48 +382,53 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part } // get the subgrid which will include padding by roughly nspread/2 - BIGINT offset1, offset2, offset3, size1, size2, size3; // get_subgrid sets - get_subgrid(offset1, offset2, offset3, size1, size2, size3, M0, kx0, ky0, kz0, ns, - ndims); // sets offsets and sizes - if (opts.debug > 1) { // verbose + BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; // get_subgrid + // sets + get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, kx0, + ky0, kz0, ns, + ndims); // sets offsets and sizes + if (opts.debug > 1) { // verbose + printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); if (ndims == 1) printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, - (long long)size1, (long long)M0); + (long long)padded_size1, (long long)M0); else if (ndims == 2) printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", - (long long)offset1, (long long)offset2, (long long)size1, + (long long)offset1, (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); else printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", (long long)offset1, (long long)offset2, (long long)offset3, - (long long)size1, (long long)size2, (long long)size3, (long long)M0); + (long long)padded_size1, (long long)size2, (long long)size3, + (long long)M0); } // allocate output data for this subgrid - FLT *du0 = (FLT *)malloc(sizeof(FLT) * 2 * size1 * size2 * size3); // complex + FLT *du0 = (FLT *)malloc(sizeof(FLT) * 2 * padded_size1 * size2 * size3); // complex // Spread to subgrid without need for bounds checking or wrapping if (!(opts.flags & TF_OMIT_SPREADING)) { if (ndims == 1) - spread_subproblem_1d(offset1, size1, du0, M0, kx0, dd0, opts); + spread_subproblem_1d(offset1, padded_size1, du0, M0, kx0, dd0, opts); else if (ndims == 2) - spread_subproblem_2d(offset1, offset2, size1, size2, du0, M0, kx0, ky0, dd0, - opts); + spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0, M0, kx0, ky0, + dd0, opts); else - spread_subproblem_3d(offset1, offset2, offset3, size1, size2, size3, du0, M0, - kx0, ky0, kz0, dd0, opts); + spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, du0, + M0, kx0, ky0, kz0, dd0, opts); } // do the adding of subgrid to output if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { if (nthr > opts.atomic_threshold) // see above for debug reporting - add_wrapped_subgrid_thread_safe(offset1, offset2, offset3, size1, size2, size3, - N1, N2, N3, data_uniform, du0); // R Blackwell's - // atomic - // version + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, + size3, N1, N2, N3, data_uniform, du0); // R + // Blackwell's + // atomic + // version else { #pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, size1, size2, size3, N1, N2, N3, - data_uniform, du0); + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, du0); } } @@ -440,8 +481,8 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // Loop over interpolation chunks #pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) // main loop over NU targs, interp each - // from U + for (BIGINT i = 0; i < M; i += CHUNKSIZE) // main loop over NU targs, interp each from + // U { // Setup buffers for this chunk int bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; @@ -463,10 +504,8 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ BIGINT i1 = (BIGINT)std::ceil(xj - ns2); // leftmost grid index - BIGINT i2 = (ndims > 1) ? (BIGINT)std::ceil(yj - ns2) : 0; // min y grid - // index - BIGINT i3 = (ndims > 2) ? (BIGINT)std::ceil(zj - ns2) : 0; // min z grid - // index + BIGINT i2 = (ndims > 1) ? (BIGINT)std::ceil(yj - ns2) : 0; // min y grid index + BIGINT i3 = (ndims > 2) ? (BIGINT)std::ceil(zj - ns2) : 0; // min z grid index FLT x1 = (FLT)i1 - xj; // shift of ker center, in [-w/2,-w/2+1] FLT x2 = (ndims > 1) ? (FLT)i2 - yj : 0; @@ -484,9 +523,6 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat } else { - // due to ns being padded up to next multiple of 4 in the - // eval_kernel_vec_Horner and writing zeros out to this padded - // size, these must occur in the order x,y,z... eval_kernel_vec_Horner(ker1, x1, ns, opts); if (ndims > 1) eval_kernel_vec_Horner(ker2, x2, ns, opts); if (ndims > 2) eval_kernel_vec_Horner(ker3, x3, ns, opts); @@ -540,8 +576,8 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int ker if (upsampfac != 2.0 && upsampfac != 1.25) { // nonstandard sigma if (kerevalmeth == 1) { fprintf(stderr, - "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be " - "handled by kerevalmeth=1\n", + "FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by " + "kerevalmeth=1\n", upsampfac); return FINUFFT_ERR_HORNER_WRONG_BETA; } @@ -589,14 +625,13 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int ker if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) fprintf(stderr, - "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width " - "ns=%d; clipping to max %d.\n", + "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; " + "clipping to max %d.\n", __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); ns = MAX_NSPREAD; ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; - // setup for reference kernel eval (via formula): select beta width param... // (even when kerevalmeth=1, this ker eval needed for FTs in onedim_*_kernel) opts.ES_halfwidth = (double)ns / 2; // constants to help (see below routines) @@ -607,8 +642,7 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int ker if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma FLT gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! - betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on - // cutoff + betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff } opts.ES_beta = betaoverns * ns; // set the kernel beta parameter if (debug) @@ -633,7 +667,7 @@ FLT evaluate_kernel(FLT x, const finufft_spread_opts &opts) return exp((FLT)opts.ES_beta * sqrt((FLT)1.0 - (FLT)opts.ES_c * x * x)); } -static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts &opts) +void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts &opts) noexcept // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. // needed for the vectorized kernel eval of Ludvig af K. { @@ -641,8 +675,8 @@ static inline void set_kernel_args(FLT *args, FLT x, const finufft_spread_opts & for (int i = 0; i < ns; i++) args[i] = x + (FLT)i; } -static inline void evaluate_kernel_vector(FLT *ker, FLT *args, - const finufft_spread_opts &opts, const int N) +void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts, + const int N) noexcept /* Evaluate ES kernel for a vector of N arguments; by Ludvig af K. If opts.kerpad true, args and ker must be allocated for Npad, and args is written to (to pad to length Npad), only first N outputs are correct. @@ -678,15 +712,85 @@ static inline void evaluate_kernel_vector(FLT *ker, FLT *args, if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } -static inline void eval_kernel_vec_Horner(FLT *ker, const FLT x, const int w, - const finufft_spread_opts &opts) +template +constexpr std::array pad_with_zeros(const std::array &input) { + std::array output{0}; + for (auto i = 0; i < N; ++i) { + output[i] = input[i]; + } + return output; +} + +template +constexpr std::array, N> pad_2D_array_with_zeros( + std::array, N> &&input) { + std::array, N> output{}; + for (std::size_t i = 0; i < N; ++i) { + output[i] = pad_with_zeros(input[i]); + } + return output; +} + +template // aka ns +void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, + const finufft_spread_opts &opts) noexcept +/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at +x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. +This is the current evaluation method, since it's faster (except i7 w=16). +Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ +{ + const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in + // [-1,1] + if (opts.upsampfac == 2.0) { // floating point equality is fine here + static constexpr const auto nc = []() { + static_assert(w <= 16, "w must be <= 16"); + static_assert(w >= 2, "w must be >= 2"); + if constexpr (w < 10) { + return w + 3; + } else { + return w + 2; + } + }(); + static constexpr const auto alignment = batch_t::arch_type::alignment(); + static constexpr const auto avx_size = batch_t::size; + static constexpr const auto padded_ns = (w + avx_size - 1) & ~(avx_size - 1); + alignas(alignment) static constexpr const auto ci = + pad_2D_array_with_zeros(get_horner_coeffs()); + alignas(alignment) const std::array zs = [](const FLT z) noexcept { + std::array zs_v{}; + auto sz = z; + for (uint8_t i = 0; i < nc - 1; ++i) { + zs_v[i] = batch_t(sz); + sz *= z; + } + return zs_v; + }(z); + for (uint8_t i = 0; i < w; i += avx_size) { + auto k = batch_t::load_aligned(ci[0].data() + i); + for (uint8_t j = 1; j < nc; ++j) { + const auto cji = batch_t::load_aligned(ci[j].data() + i); + k = xsimd::fma(cji, zs[j - 1], k); + } + k.store_aligned(ker + i); + } + return; + } + // insert the auto-generated code which expects z, w args, writes to ker... + if (opts.upsampfac == 1.25) { +#include "ker_lowupsampfac_horner_allw_loop_constexpr.c" + return; + } +} +void eval_kernel_vec_Horner(FLT *ker, const FLT x, const int w, + const finufft_spread_opts &opts) noexcept /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ { if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - FLT z = (FLT)2.0 * x + w - (FLT)1.0; // scale so local grid offset z in [-1,1] + const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in + // [-1,1] // insert the auto-generated code which expects z, w args, writes to ker... if (opts.upsampfac == 2.0) { // floating point equality is fine here #include "ker_horner_allw_loop.c" @@ -871,9 +975,8 @@ void interp_cube(FLT *target, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i for (int dz = 0; dz < ns; dz++) { BIGINT oz = N1 * N2 * (i3 + dz); // offset due to z for (int dy = 0; dy < ns; dy++) { - const FLT *lptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of - // line - FLT ker23 = ker2[dy] * ker3[dz]; + const FLT *lptr = du + 2 * (oz + N1 * (i2 + dy) + i1); // ptr start of line + FLT ker23 = ker2[dy] * ker3[dz]; for (int l = 0; l < 2 * ns; ++l) { // loop over ns interleaved (R,I) pairs line[l] += lptr[l] * ker23; } @@ -918,237 +1021,455 @@ void interp_cube(FLT *target, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i target[1] = out[1]; } -void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du, BIGINT M, FLT *kx, FLT *dd, - const finufft_spread_opts &opts) -/* 1D spreader from nonuniform to uniform subproblem grid, without wrapping. - Inputs: - off1 - integer offset of left end of du subgrid from that of overall fine - periodized output grid {0,1,..N-1}. - size1 - integer length of output subgrid du - M - number of NU pts in subproblem - kx (length M) - are rescaled NU source locations, should lie in - [off1+ns/2,off1+size1-1-ns/2] so as kernels stay in bounds - dd (length M complex, interleaved) - source strengths - Outputs: - du (length size1 complex, interleaved) - preallocated uniform subgrid array - - The reason periodic wrapping is avoided in subproblems is speed: avoids - conditionals, indirection (pointers), and integer mod. Originally 2017. - Kernel eval mods by Ludvig al Klinteberg. - Fixed so rounding to integer grid consistent w/ get_subgrid, prevents - chance of segfault when epsmach*N1>O(1), assuming max() and ceil() commute. - This needed off1 as extra arg. AHB 11/30/20. -*/ -{ - int ns = opts.nspread; // a.k.a. w - FLT ns2 = (FLT)ns / 2; // half spread width - for (BIGINT i = 0; i < 2 * size1; ++i) // zero output - du[i] = 0.0; - FLT kernel_args[MAX_NSPREAD]; - FLT ker[MAX_NSPREAD]; - for (BIGINT i = 0; i < M; i++) { // loop over NU pts - FLT re0 = dd[2 * i]; - FLT im0 = dd[2 * i + 1]; +template +FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( + const BIGINT off1, const BIGINT size1, FLT *FINUFFT_RESTRICT du, const BIGINT M, + const FLT *const kx, const FLT *const dd, const finufft_spread_opts &opts) noexcept { + /* 1D spreader from nonuniform to uniform subproblem grid, without wrapping. + Inputs: + off1 - integer offset of left end of du subgrid from that of overall fine + periodized output grid {0,1,...N-1}. + size1 - integer length of output subgrid du + M - number of NU pts in subproblem + kx (length M) - are rescaled NU source locations, should lie in + [off1+ns/2,off1+size1-1-ns/2] so as kernels stay in bounds + dd (length M complex, interleaved) - source strengths + Outputs: + du (length size1 complex, interleaved) - preallocated uniform subgrid array + + The reason periodic wrapping is avoided in subproblems is speed: avoids + conditionals, indirection (pointers), and integer mod. Originally 2017. + Kernel eval mods by Ludvig al Klinteberg. + Fixed so rounding to integer grid consistent w/ get_subgrid, prevents + chance of segfault when epsmach*N1>O(1), assuming max() and ceil() commute. + This needed off1 as extra arg. AHB 11/30/20. + Vectorized using xsimd by M. Barbone 06/24. + */ + using batch_t = PaddedSIMD; + using arch_t = typename batch_t::arch_type; + static constexpr auto padding = get_padding(); + static constexpr auto alignment = batch_t::arch_type::alignment(); + static constexpr auto avx_size = batch_t::size; + static constexpr auto ns2 = ns * FLT(0.5); // half spread width + // something weird here. Reversing ker{0} and std fill causes ker + // to be zeroed inside the loop GCC uses AVX, clang AVX2 + std::fill(du, du + 2 * size1, 0); // zero output + // no padding needed if MAX_NSPREAD is 16 + // the largest read is 16 floats with avx512 + // if larger instructions will be available or half precision is used, this should be + // padded + for (uint64_t i{0}; i < M; i++) { // loop over NU pts + // lamda here to return a dd_pt that is const + // should not make a difference in performance + // but is a hint to the compiler that after the lambda + // dd_pt is not modified and can be kept as is in a register + // given (re, im) in this case dd[i*2] and dd[i*2+1] + // this function returns a simd register of size avx_size + // initialized as follows: + // +-----------------------+ + // |re|im|re|im|re|im|re|im| + // +-----------------------+ + const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... - BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); // fine grid start index - FLT x1 = (FLT)i1 - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding + const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index + // FLT(i1) has different semantics and results an extra cast + auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly // kernel evaluation will fall outside their designed domains, >>1 errors. // This can only happen if the overall error would be O(1) anyway. Clip x1?? if (x1 < -ns2) x1 = -ns2; if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** - if (opts.kerevalmeth == 0) { // faster Horner poly method - set_kernel_args(kernel_args, x1, opts); - evaluate_kernel_vector(ker, kernel_args, opts, ns); - } else - eval_kernel_vec_Horner(ker, x1, ns, opts); - BIGINT j = i1 - off1; // offset rel to subgrid, starts the output indices - // critical inner loop: - for (int dx = 0; dx < ns; ++dx) { - FLT k = ker[dx]; - du[2 * j] += re0 * k; - du[2 * j + 1] += im0 * k; - ++j; + alignas(alignment) const auto ker = ker_eval(opts, x1); + const auto j = i1 - off1; // offset rel to subgrid, starts the output indices + auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize + // du is padded, so we can use SIMD even if we write more than ns values in du + // ker0 is also padded. + // regular_part is the largest multiple of 2*ns minus the remainder modulo + // (2*avx_size). This allows to save one load. + // see below for the details. + // adding padding to guarantee that all the elments are computed + // this trick only works when avx_size is a power of 2 + // avx_size*2 is guaranteed to be a power of 2, trivially + static constexpr auto regular_part = (2 * ns + padding) & (-(2 * avx_size)); + // this loop increment is 2*avx_size by design + // it allows to save one load this way + // this does for each element e of the subgrid, x1 defined above and pt the NU point + // the following: e += exp(beta.sqrt(1 - (2*x1/n_s)^2))*pt + // NOTE: x1 is translated accordingly, please see the ES method for more + // using uint8_t in loops to favor unrolling. + // Most compilers limit the unrolling to 255, uint8_t is at most 255 + for (uint8_t dx{0}; dx < regular_part; dx += 2 * avx_size) { + // read ker01 which is avx_size wide from ker + // ker01 looks like this: + // +-----------------------+ + // |y0|y1|y2|y3|y4|y5|y6|y7| + // +-----------------------+ + const auto ker01 = batch_t::load_aligned(ker.data() + dx / 2); + // read 2*SIMD vectors from the subproblem grid + const auto du_pt0 = batch_t::load_unaligned(trg + dx); + const auto du_pt1 = batch_t::load_unaligned(trg + dx + avx_size); + // swizzle is faster than zip_lo(ker01, ker01) and zip_hi(ker01, ker01) + // swizzle in this case is equivalent to zip_lo and zip_hi respectively + const auto ker0 = xsimd::swizzle(ker01, zip_low_index); + // ker 0 looks like this now: + // +-----------------------+ + // |y0|y0|y1|y1|y2|y2|y3|y3| + // +-----------------------+ + const auto ker1 = xsimd::swizzle(ker01, zip_hi_index); + // ker 1 looks like this now: + // +-----------------------+ + // |y4|y4|y5|y5|y6|y6|y7|y7| + // +-----------------------+ + // same as before each element of the subproblem grid is multiplied by the + // corresponding element of the kernel since dd_pt is re|im interleaves res0 is also + // correctly re|im interleaved + // doing this for two SIMD vectors at once allows to fully utilize ker01 instead of + // wasting the higher half + const auto res0 = xsimd::fma(ker0, dd_pt, du_pt0); + const auto res1 = xsimd::fma(ker1, dd_pt, du_pt1); + res0.store_unaligned(trg + dx); + res1.store_unaligned(trg + dx + avx_size); + } + // sanity check at compile time that all the elements are computed + static_assert(regular_part + avx_size >= 2 * ns); + // case where the 2*ns is not a multiple of 2*avx_size + // checking 2*ns instead of 2*ns+padding as we do not need to compute useless zeros... + if constexpr (regular_part < 2 * ns) { + // here we need to load the last kernel values, + // but we can avoid computing extra padding + // also this padding will result in out-of-bounds access to trg + const auto ker01 = batch_t::load_unaligned(ker.data() + (regular_part / 2)); + const auto du_pt = batch_t::load_unaligned(trg + regular_part); + const auto ker0 = xsimd::swizzle(ker01, zip_low_index); + const auto res = xsimd::fma(ker0, dd_pt, du_pt); + res.store_unaligned(trg + regular_part); } } } -void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, FLT *du, - BIGINT M, FLT *kx, FLT *ky, FLT *dd, - const finufft_spread_opts &opts) +template +static void spread_subproblem_1d_dispatch( + const BIGINT off1, const BIGINT size1, FLT *FINUFFT_RESTRICT du, const BIGINT M, + const FLT *kx, const FLT *dd, const finufft_spread_opts &opts) noexcept { + /* this is a dispatch function that will call the correct kernel based on the ns + it recursively iterates from MAX_NSPREAD to MIN_NSPREAD + it generates the following code: + if (ns == MAX_NSPREAD) { + if (opts.kerevalmeth) { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); + } else { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); + } + if (ns == MAX_NSPREAD-1) { + if (opts.kerevalmeth) { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); + } else { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, + dd, opts); + } + } + ... + NOTE: using a big MAX_NSPREAD will generate a lot of code + if MAX_NSPREAD gets too large it will crash the compiler with a compile time + stack overflow. Older compiler will just throw an internal error without + providing any useful information on the error. + This is a known issue with template metaprogramming. + If you increased MAX_NSPREAD and the code does not compile, try reducing it. + */ + static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, + "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); + if constexpr (NS == MIN_NSPREAD) { // Base case + if (opts.kerevalmeth) + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); + else { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, + opts); + } + } else { + if (opts.nspread == NS) { + if (opts.kerevalmeth) { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); + } else { + return spread_subproblem_1d_kernel(off1, size1, du, M, kx, dd, opts); + } + } else { + return spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); + } + } +} + +void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du, BIGINT M, FLT *kx, FLT *dd, + const finufft_spread_opts &opts) noexcept /* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d. kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. dd (size M complex) are complex source strengths du (size size1*size2) is complex uniform output array - */ + For algoritmic details see spread_subproblem_1d_kernel. +*/ +{ + spread_subproblem_1d_dispatch(off1, size1, du, M, kx, dd, opts); +} + +template +FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( + const BIGINT off1, const BIGINT off2, const BIGINT size1, const BIGINT size2, + FLT *FINUFFT_RESTRICT du, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *dd, + const finufft_spread_opts &opts) noexcept +/* spreader from dd (NU) to du (uniform) in 2D without wrapping. + See above docs/notes for spread_subproblem_2d. + kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. + dd (size M complex) are complex source strengths + du (size size1*size2) is complex uniform output array + For algoritmic details see spread_subproblem_1d_kernel. +*/ { - int ns = opts.nspread; - FLT ns2 = (FLT)ns / 2; // half spread width - for (BIGINT i = 0; i < 2 * size1 * size2; ++i) du[i] = 0.0; - FLT kernel_args[2 * MAX_NSPREAD]; + using batch_t = PaddedSIMD; + using arch_t = typename batch_t::arch_type; + static constexpr auto padding = get_padding(); + static constexpr auto avx_size = batch_t::size; + static constexpr auto alignment = batch_t::arch_type::alignment(); // Kernel values stored in consecutive memory. This allows us to compute - // values in two directions in a single kernel evaluation call. - FLT kernel_values[2 * MAX_NSPREAD]; - FLT *ker1 = kernel_values; - FLT *ker2 = kernel_values + ns; - for (BIGINT i = 0; i < M; i++) { // loop over NU pts - FLT re0 = dd[2 * i]; - FLT im0 = dd[2 * i + 1]; + // values in all three directions in a single kernel evaluation call. + static constexpr auto ns2 = ns * FLT(0.5); // half spread width + std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding + for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts + const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... - BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); // fine grid start indices - BIGINT i2 = (BIGINT)std::ceil(ky[i] - ns2); - FLT x1 = (FLT)i1 - kx[i]; - FLT x2 = (FLT)i2 - ky[i]; - if (opts.kerevalmeth == 0) { // faster Horner poly method - set_kernel_args(kernel_args, x1, opts); - set_kernel_args(kernel_args + ns, x2, opts); - evaluate_kernel_vector(kernel_values, kernel_args, opts, 2 * ns); - } else { - // due to ns being padded up to next multiple of 4 in the - // eval_kernel_vec_Horner and writing zeros out to this padded size, these - // must occur in the order x,y... - eval_kernel_vec_Horner(ker1, x1, ns, opts); - eval_kernel_vec_Horner(ker2, x2, ns, opts); - } + const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices + const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); + const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; + const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; + alignas(alignment) const auto kernel_values = + ker_eval(opts, x1, x2); + alignas(alignment) auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + alignas(alignment) auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop - FLT ker1val[2 * MAX_NSPREAD]; // here 2* is because of complex - for (int i = 0; i < ns; i++) { - ker1val[2 * i] = re0 * ker1[i]; - ker1val[2 * i + 1] = im0 * ker1[i]; + // here 2* is because of complex + static constexpr uint8_t batches = (2 * ns + padding) / avx_size; + static_assert(batches > 0, "batches must be greater than 0"); + batch_t ker1val_batches[batches]; + + for (u_int8_t i = 0; i < (batches & ~1); i += 2) { + const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); + const auto ker00 = xsimd::swizzle(ker01, zip_low_index); + const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); + ker1val_batches[i] = ker00 * dd_pt; + ker1val_batches[i + 1] = ker11 * dd_pt; + } + if constexpr (batches % 2) { + const auto ker1_batch = + batch_t::load_unaligned(ker1 + (batches - 1) * avx_size / 2); + const auto res = xsimd::swizzle(ker1_batch, zip_low_index) * dd_pt; + ker1val_batches[batches - 1] = res; } // critical inner loop: - for (int dy = 0; dy < ns; ++dy) { - BIGINT j = size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid - FLT kerval = ker2[dy]; - FLT *trg = du + 2 * j; - for (int dx = 0; dx < 2 * ns; ++dx) { - trg[dx] += kerval * ker1val[dx]; + for (auto dy = 0; dy < ns; ++dy) { + const auto j = size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid + auto *FINUFFT_RESTRICT trg = du + 2 * j; + const batch_t kerval_batch(ker2[dy]); + for (u_int8_t i = 0; i < batches; ++i) { + const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); + const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); + result.store_unaligned(trg + i * avx_size); } } } } -void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, - BIGINT size2, BIGINT size3, FLT *du, BIGINT M, FLT *kx, FLT *ky, - FLT *kz, FLT *dd, const finufft_spread_opts &opts) -/* spreader from dd (NU) to du (uniform) in 3D without wrapping. +template +void spread_subproblem_2d_dispatch( + const BIGINT off1, const BIGINT off2, const BIGINT size1, const BIGINT size2, + FLT *FINUFFT_RESTRICT du, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *dd, + const finufft_spread_opts &opts) { + static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, + "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); + if constexpr (NS == MIN_NSPREAD) { // Base case + if (opts.kerevalmeth) + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, + M, kx, ky, dd, opts); + else { + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, + M, kx, ky, dd, opts); + } + } else { + if (opts.nspread == NS) { + if (opts.kerevalmeth) { + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, kx, + ky, dd, opts); + } else { + return spread_subproblem_2d_kernel(off1, off2, size1, size2, du, M, kx, + ky, dd, opts); + } + } else { + return spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, + ky, dd, opts); + } + } +} + +void spread_subproblem_2d(const BIGINT off1, const BIGINT off2, const BIGINT size1, + const BIGINT size2, FLT *FINUFFT_RESTRICT du, const BIGINT M, + const FLT *kx, const FLT *ky, const FLT *dd, + const finufft_spread_opts &opts) noexcept +/* spreader from dd (NU) to du (uniform) in 2D without wrapping. See above docs/notes for spread_subproblem_2d. - kx,ky,kz (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in each dim. + kx,ky (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in both dims. dd (size M complex) are complex source strengths - du (size size1*size2*size3) is uniform complex output array - */ + du (size size1*size2) is complex uniform output array + For algoritmic details see spread_subproblem_1d_kernel. +*/ { - int ns = opts.nspread; - FLT ns2 = (FLT)ns / 2; // half spread width - for (BIGINT i = 0; i < 2 * size1 * size2 * size3; ++i) du[i] = 0.0; - FLT kernel_args[3 * MAX_NSPREAD]; - // Kernel values stored in consecutive memory. This allows us to compute - // values in all three directions in a single kernel evaluation call. - FLT kernel_values[3 * MAX_NSPREAD]; - FLT *ker1 = kernel_values; - FLT *ker2 = kernel_values + ns; - FLT *ker3 = kernel_values + 2 * ns; - for (BIGINT i = 0; i < M; i++) { // loop over NU pts - FLT re0 = dd[2 * i]; - FLT im0 = dd[2 * i + 1]; + spread_subproblem_2d_dispatch(off1, off2, size1, size2, du, M, kx, ky, dd, + opts); +} + +template +FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( + const BIGINT off1, const BIGINT off2, const BIGINT off3, const BIGINT size1, + const BIGINT size2, const BIGINT size3, FLT *FINUFFT_RESTRICT du, const BIGINT M, + const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd, + const finufft_spread_opts &opts) noexcept { + using batch_t = PaddedSIMD; + using arch_t = typename batch_t::arch_type; + static constexpr auto padding = get_padding(); + static constexpr auto avx_size = batch_t::size; + static constexpr auto alignment = batch_t::arch_type::alignment(); + static constexpr auto ns2 = ns * FLT(0.5); // half spread width + std::fill(du, du + 2 * size1 * size2 * size3, 0); + for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts + const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... - BIGINT i1 = (BIGINT)std::ceil(kx[i] - ns2); // fine grid start indices - BIGINT i2 = (BIGINT)std::ceil(ky[i] - ns2); - BIGINT i3 = (BIGINT)std::ceil(kz[i] - ns2); - FLT x1 = (FLT)i1 - kx[i]; - FLT x2 = (FLT)i2 - ky[i]; - FLT x3 = (FLT)i3 - kz[i]; - if (opts.kerevalmeth == 0) { // faster Horner poly method - set_kernel_args(kernel_args, x1, opts); - set_kernel_args(kernel_args + ns, x2, opts); - set_kernel_args(kernel_args + 2 * ns, x3, opts); - evaluate_kernel_vector(kernel_values, kernel_args, opts, 3 * ns); - } else { - // due to ns being padded up to next multiple of 4 in the - // eval_kernel_vec_Horner and writing zeros out to this padded size, these - // must occur in the order x,y,z... - eval_kernel_vec_Horner(ker1, x1, ns, opts); - eval_kernel_vec_Horner(ker2, x2, ns, opts); - eval_kernel_vec_Horner(ker3, x3, ns, opts); - } + const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices + const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); + const auto i3 = (BIGINT)std::ceil(kz[pt] - ns2); + const auto x1 = std::ceil(kx[pt] - ns2) - kx[pt]; + const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; + const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; + + alignas(alignment) const auto kernel_values = + ker_eval(opts, x1, x2, x3); + auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop - FLT ker1val[2 * MAX_NSPREAD]; // here 2* is because of complex - for (int i = 0; i < ns; i++) { - ker1val[2 * i] = re0 * ker1[i]; - ker1val[2 * i + 1] = im0 * ker1[i]; + // here 2* is because of complex + // Batches is the number of SIMD iterations needed to compute all the elements + static constexpr uint8_t batches = (2 * ns + padding) / avx_size; + static_assert(batches > 0, "batches must be greater than 0"); + batch_t ker1val_batches[batches]; + // Iterate over batches but in case the number of batches is odd + // we need to handle the last batch separately + // to the & ~1 is to ensure that we do not iterate over the last batch if it is odd + // as it sets the last bit to 0 + for (u_int8_t i = 0; i < (batches & ~1); i += 2) { + const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); + const auto ker00 = xsimd::swizzle(ker01, zip_low_index); + const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); + ker1val_batches[i] = ker00 * dd_pt; + ker1val_batches[i + 1] = ker11 * dd_pt; + } + + // (at compile time) check if the number of batches is odd + // if it is we need to handle the last batch separately + if constexpr (batches % 2) { + const auto ker1_batch = + batch_t::load_unaligned(ker1 + (batches - 1) * avx_size / 2); + const auto res = xsimd::swizzle(ker1_batch, zip_low_index) * dd_pt; + ker1val_batches[batches - 1] = res; } // critical inner loop: - for (int dz = 0; dz < ns; ++dz) { - BIGINT oz = size1 * size2 * (i3 - off3 + dz); // offset due to z - for (int dy = 0; dy < ns; ++dy) { - BIGINT j = oz + size1 * (i2 - off2 + dy) + i1 - off1; // should be in - // subgrid - FLT kerval = ker2[dy] * ker3[dz]; - FLT *trg = du + 2 * j; - for (int dx = 0; dx < 2 * ns; ++dx) { - trg[dx] += kerval * ker1val[dx]; + for (u_int8_t dz{0}; dz < ns; ++dz) { + const auto oz = size1 * size2 * (i3 - off3 + dz); // offset due to z + for (u_int8_t dy{0}; dy < ns; ++dy) { + const auto j = oz + size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid + auto *FINUFFT_RESTRICT trg = du + 2 * j; + const auto kerval = ker2[dy] * ker3[dz]; + const batch_t kerval_batch(kerval); + for (u_int8_t i{0}; i < batches; ++i) { + const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); + const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); + result.store_unaligned(trg + i * avx_size); } } } } } -void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT size1, - BIGINT size2, BIGINT size3, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, FLT *du0) -/* Add a large subgrid (du0) to output grid (data_uniform), - with periodic wrapping to N1,N2,N3 box. - offset1,2,3 give the offset of the subgrid from the lowest corner of output. - size1,2,3 give the size of subgrid. - Works in all dims. Not thread-safe and must be called inside omp critical. - Barnett 3/27/18 made separate routine, tried to speed up inner loop. -*/ -{ - std::vector o2(size2), o3(size3); - BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z... - for (int i = 0; i < size2; ++i) { - if (y < 0) y += N2; - if (y >= N2) y -= N2; - o2[i] = y++; - } - for (int i = 0; i < size3; ++i) { - if (z < 0) z += N3; - if (z >= N3) z -= N3; - o3[i] = z++; - } - BIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x - BIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x - // this triple loop works in all dims - for (int dz = 0; dz < size3; dz++) { // use ptr lists in each axis - BIGINT oz = N1 * N2 * o3[dz]; // offset due to z (0 in <3D) - for (int dy = 0; dy < size2; dy++) { - BIGINT oy = oz + N1 * o2[dy]; // off due to y & z (0 in 1D) - FLT *out = data_uniform + 2 * oy; - FLT *in = du0 + 2 * size1 * (dy + size2 * dz); // ptr to subgrid array - BIGINT o = 2 * (offset1 + N1); // 1d offset for output - for (int j = 0; j < 2 * nlo; j++) // j is really dx/2 (since re,im parts) - out[j + o] += in[j]; - o = 2 * offset1; - for (int j = 2 * nlo; j < 2 * (size1 - nhi); j++) out[j + o] += in[j]; - o = 2 * (offset1 - N1); - for (int j = 2 * (size1 - nhi); j < 2 * size1; j++) out[j + o] += in[j]; +template +void spread_subproblem_3d_dispatch( + BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, BIGINT size2, BIGINT size3, + FLT *du, BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd, + const finufft_spread_opts &opts) noexcept { + static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, + "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); + if constexpr (NS == MIN_NSPREAD) { // Base case + if (opts.kerevalmeth) + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); + else { + return spread_subproblem_3d_kernel( + off1, off2, off3, size1, size2, size3, du, M, kx, ky, kz, dd, opts); + } + } else { + if (opts.nspread == NS) { + if (opts.kerevalmeth) { + return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, + size3, du, M, kx, ky, kz, dd, opts); + } else { + return spread_subproblem_3d_kernel(off1, off2, off3, size1, size2, + size3, du, M, kx, ky, kz, dd, opts); + } + } else { + return spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, + du, M, kx, ky, kz, dd, opts); } } } -void add_wrapped_subgrid_thread_safe(BIGINT offset1, BIGINT offset2, BIGINT offset3, - BIGINT size1, BIGINT size2, BIGINT size3, BIGINT N1, - BIGINT N2, BIGINT N3, FLT *data_uniform, FLT *du0) +void spread_subproblem_3d(BIGINT off1, BIGINT off2, BIGINT off3, BIGINT size1, + BIGINT size2, BIGINT size3, FLT *du, BIGINT M, FLT *kx, FLT *ky, + FLT *kz, FLT *dd, const finufft_spread_opts &opts) noexcept +/* spreader from dd (NU) to du (uniform) in 3D without wrapping. +See above docs/notes for spread_subproblem_2d. +kx,ky,kz (size M) are NU locations in [off+ns/2,off+size-1-ns/2] in each dim. +dd (size M complex) are complex source strengths +du (size size1*size2*size3) is uniform complex output array +*/ +{ + spread_subproblem_3d_dispatch(off1, off2, off3, size1, size2, size3, du, M, + kx, ky, kz, dd, opts); +} + +template +void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, + BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, + BIGINT N1, BIGINT N2, BIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const FLT *const du0) /* Add a large subgrid (du0) to output grid (data_uniform), with periodic wrapping to N1,N2,N3 box. offset1,2,3 give the offset of the subgrid from the lowest corner of output. - size1,2,3 give the size of subgrid. + padded_size1,2,3 give the size of subgrid. Works in all dims. Thread-safe variant of the above routine, using atomic writes (R Blackwell, Nov 2020). + Merged the thread_safe and the not thread_safe version of the function into one + (M. Barbone 06/24). */ { std::vector o2(size2), o3(size3); + static auto accumulate = [](FLT &a, FLT b) { + if constexpr (thread_safe) { // NOLINT(*-branch-clone) +#pragma omp atomic + a += b; + } else { + a += b; + } + }; + BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z... for (int i = 0; i < size2; ++i) { if (y < 0) y += N2; @@ -1163,26 +1484,23 @@ void add_wrapped_subgrid_thread_safe(BIGINT offset1, BIGINT offset2, BIGINT offs BIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x BIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x // this triple loop works in all dims - for (int dz = 0; dz < size3; dz++) { // use ptr lists in each axis - BIGINT oz = N1 * N2 * o3[dz]; // offset due to z (0 in <3D) + for (int dz = 0; dz < size3; dz++) { // use ptr lists in each axis + const auto oz = N1 * N2 * o3[dz]; // offset due to z (0 in <3D) for (int dy = 0; dy < size2; dy++) { - BIGINT oy = oz + N1 * o2[dy]; // off due to y & z (0 in 1D) - FLT *out = data_uniform + 2 * oy; - FLT *in = du0 + 2 * size1 * (dy + size2 * dz); // ptr to subgrid array - BIGINT o = 2 * (offset1 + N1); // 1d offset for output - for (int j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts) -#pragma omp atomic - out[j + o] += in[j]; + const auto oy = N1 * o2[dy] + oz; // off due to y & z (0 in 1D) + auto *FINUFFT_RESTRICT out = data_uniform + 2 * oy; + const auto in = du0 + 2 * padded_size1 * (dy + size2 * dz); // ptr to subgrid array + auto o = 2 * (offset1 + N1); // 1d offset for output + for (auto j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts) + accumulate(out[j + o], in[j]); } o = 2 * offset1; - for (int j = 2 * nlo; j < 2 * (size1 - nhi); j++) { -#pragma omp atomic - out[j + o] += in[j]; + for (auto j = 2 * nlo; j < 2 * (size1 - nhi); j++) { + accumulate(out[j + o], in[j]); } o = 2 * (offset1 - N1); - for (int j = 2 * (size1 - nhi); j < 2 * size1; j++) { -#pragma omp atomic - out[j + o] += in[j]; + for (auto j = 2 * (size1 - nhi); j < 2 * size1; j++) { + accumulate(out[j + o], in[j]); } } } @@ -1219,23 +1537,27 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIG * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed). */ { - bool isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) + const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - BIGINT nbins1 = N1 / bin_size_x + 1, nbins2, nbins3; - nbins2 = isky ? N2 / bin_size_y + 1 : 1; - nbins3 = iskz ? N3 / bin_size_z + 1 : 1; - BIGINT nbins = nbins1 * nbins2 * nbins3; - - std::vector counts(nbins, 0); // count how many pts in each bin - for (BIGINT i = 0; i < M; i++) { + const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins = nbins1 * nbins2 * nbins3; + const auto inv_bin_size_x = FLT(1.0 / bin_size_x); + const auto inv_bin_size_y = FLT(1.0 / bin_size_y); + const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + // count how many pts in each bin + std::vector counts(nbins, 0); + + for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed - BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; - BIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); - counts[bin]++; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + ++counts[bin]; } // compute the offsets directly in the counts array (no offset array) @@ -1246,14 +1568,14 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIG current_offset += tmp; } // (counts now contains the index offsets for each bin) - for (BIGINT i = 0; i < M; i++) { + for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) - BIGINT i1 = fold_rescale(kx[i], N1) / bin_size_x, i2 = 0, i3 = 0; - if (isky) i2 = fold_rescale(ky[i], N2) / bin_size_y; - if (iskz) i3 = fold_rescale(kz[i], N3) / bin_size_z; - BIGINT bin = i1 + nbins1 * (i2 + nbins2 * i3); - ret[counts[bin]] = i; // fill the inverse map on the fly - ++counts[bin]; // update the offsets + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); + ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly + ++counts[bin]; // update the offsets } } @@ -1327,9 +1649,9 @@ void bin_sort_multithread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIGI } } -void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size1, - BIGINT &size2, BIGINT &size3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - int ns, int ndims) +void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &padded_size1, + BIGINT &size1, BIGINT &size2, BIGINT &size3, BIGINT M, FLT *kx, FLT *ky, + FLT *kz, int ns, int ndims) /* Writes out the integer offsets and sizes of a "subgrid" (cuboid subset of Z^ndims) large enough to enclose all of the nonuniform points with (non-periodic) padding of half the kernel width ns to each side in @@ -1346,7 +1668,7 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size Outputs: offset1,2,3 - left-most coord of cuboid in each dimension (up to ndims) - size1,2,3 - size of cuboid in each dimension. + padded_size1,2,3 - size of cuboid in each dimension. Thus the right-most coord of cuboid is offset+size-1. Returns offset 0 and size 1 for each unused dimension (ie when ndims<3); this is required by the calling code. @@ -1356,7 +1678,7 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size ndims=1, M=2, kx[0]=0.2, ks[1]=4.9, ns=3 outputs: offset1=-1 (since kx[0] spreads to {-1,0,1}, and -1 is the min) - size1=8 (since kx[1] spreads to {4,5,6}, so subgrid is {-1,..,6} + padded_size1=8 (since kx[1] spreads to {4,5,6}, so subgrid is {-1,..,6} hence 8 grid points). Notes: 1) Works in all dims 1,2,3. @@ -1376,8 +1698,9 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size FLT ns2 = (FLT)ns / 2; FLT min_kx, max_kx; // 1st (x) dimension: get min/max of nonuniform points arrayrange(M, kx, &min_kx, &max_kx); - offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel - size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! + offset1 = (BIGINT)std::ceil(min_kx - ns2); // min index touched by kernel + size1 = (BIGINT)std::ceil(max_kx - ns2) - offset1 + ns; // int(ceil) first! + padded_size1 = size1 + get_padding(2 * ns) / 2; if (ndims > 1) { FLT min_ky, max_ky; // 2nd (y) dimension: get min/max of nonuniform points arrayrange(M, ky, &min_ky, &max_ky); @@ -1404,10 +1727,128 @@ void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, BIGINT &size Martin Reinecke, 8.5.2024 used floor to speedup the function and removed the range limitation Marco Barbone, 8.5.2024 Changed it from a Macro to an inline function */ -FINUFFT_ALWAYS_INLINE FLT fold_rescale(const FLT x, const BIGINT N) noexcept { +FLT fold_rescale(const FLT x, const BIGINT N) noexcept { static constexpr const FLT x2pi = FLT(M_1_2PI); const FLT result = x * x2pi + FLT(0.5); return (result - floor(result)) * FLT(N); } -} // namespace spreadinterp -} // namespace finufft + +template +auto ker_eval(const finufft_spread_opts &opts, const V... elems) noexcept { + /* Utility function that allows to move the kernel evaluation outside the spreader for + clarity Inputs are: ns = kernel width kerevalmeth = kernel evaluation method T = + (single or double precision) type of the kernel batch_t = batch type for Horner + vectorization (default is the optimal batch size) finufft_spread_opts as Horner needs + the oversampling factor elems = kernel arguments examples usage is ker_eval(opts, x, y, z) // for 3D or ker_eval(opts, x, y) // for + 2D or ker_eval(opts, x) // for 1D + */ + alignas(batch_t::arch_type::alignment()) std::array + ker{0}; + const std::array inputs{elems...}; + // compile time loop, no performance overhead + for (auto i = 0; i < sizeof...(elems); ++i) { + // compile time branch no performance overhead + if constexpr (kerevalmeth == 1) { + eval_kernel_vec_Horner(ker.data() + (i * MAX_NSPREAD), inputs[i], + opts); + } + if constexpr (kerevalmeth == 0) { + alignas(batch_t::arch_type::alignment()) std::array kernel_args{}; + set_kernel_args(kernel_args.data(), inputs[i], opts); + evaluate_kernel_vector(ker.data() + (i * MAX_NSPREAD), kernel_args.data(), opts, + ns); + } + } + return ker; +} + +namespace { +template +constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { + // utility function to generate a sequence of a, b interleaved as function arguments + return T(((Is % 2 == 0) ? a : b)...); +} + +template +constexpr auto initialize_complex_batch(V a, V b) noexcept { + // populates a SIMD register with a and b interleaved + // for example: + // +-------------------------------+ + // | a | b | a | b | a | b | a | b | + // +-------------------------------+ + // it uses index_sequence to generate the sequence of a, b at compile time + return generate_sequence_impl(a, b, std::make_index_sequence{}); +} + +// Below there is some template metaprogramming magic to find the best SIMD type +// for the given number of elements. The code is based on the xsimd library + +// this finds the largest SIMD instruction set that can handle N elements +// void otherwise -> compile error +template constexpr auto BestSIMDHelper() { + if constexpr (N % K == 0) { // returns void in the worst case + return xsimd::make_sized_batch{}; + } else { + return BestSIMDHelper> 1)>(); + } +} + +template constexpr uint16_t min_batch_size() { + if constexpr (std::is_void_v>) { + return min_batch_size(); + } else { + return N; + } +}; + +template constexpr auto find_optimal_batch_size() { + uint16_t optimal_batch_size = min_batch_size(); + uint16_t min_iterations = (N + optimal_batch_size - 1) / optimal_batch_size; + for (uint16_t batch_size = optimal_batch_size; + batch_size <= xsimd::batch::size; + batch_size *= 2) { + uint16_t iterations = (N + batch_size - 1) / batch_size; + if (iterations < min_iterations) { + min_iterations = iterations; + optimal_batch_size = batch_size; + } + } + return optimal_batch_size; +} + +template constexpr auto GetPaddedSIMDSize() { + return xsimd::make_sized_batch()>::type::size; +} + +template constexpr auto get_padding() { + constexpr uint16_t width = GetPaddedSIMDSize(); + return ns % width == 0 ? 0 : width - (ns % width); +} + +template constexpr auto get_padding_helper(uint16_t runtime_ns) { + if constexpr (ns < 2) { + return 0; + } else { + if (runtime_ns == ns) { + return get_padding(); + } else { + return get_padding_helper(runtime_ns); + } + } +} + +template uint16_t get_padding(uint8_t ns) { + return get_padding_helper(ns); +} + +struct zip_low { + static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; } +}; +struct zip_hi { + static constexpr unsigned get(unsigned index, unsigned size) { + return (size + index) / 2; + } +}; +} // namespace +} // namespace finufft::spreadinterp From 623d591eb8a9cab9a8900036a6df612468b46842 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 6 Jun 2024 18:19:05 -0400 Subject: [PATCH 03/35] restored untouched files --- src/finufft.cpp | 166 +++++++++++++++++++++++++----------------------- 1 file changed, 87 insertions(+), 79 deletions(-) diff --git a/src/finufft.cpp b/src/finufft.cpp index 8b9c6006b..03c1d9ac6 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -38,41 +38,41 @@ using namespace finufft::quadrature; Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: TYPE 1: - The type 1 NUFFT proceeds in three main steps: - 1) spread data to oversampled regular mesh using kernel. - 2) compute FFT on uniform mesh - 3) deconvolve by division of each Fourier mode independently by the kernel - Fourier series coeffs (not merely FFT of kernel), shuffle to output. - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 1 NUFFT proceeds in three main steps: + 1) spread data to oversampled regular mesh using kernel. + 2) compute FFT on uniform mesh + 3) deconvolve by division of each Fourier mode independently by the kernel + Fourier series coeffs (not merely FFT of kernel), shuffle to output. + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 3a internally uses CPX, and Step 3b internally uses real arithmetic and FFTW style complex. TYPE 2: - The type 2 algorithm proceeds in three main steps: - 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff - 2) compute inverse FFT on uniform fine grid - 3) spread (dir=2, ie interpolate) data to regular mesh - The kernel coeffs are precomputed in what is called step 0 in the code. + The type 2 algorithm proceeds in three main steps: + 1) deconvolve (amplify) each Fourier mode, dividing by kernel Fourier coeff + 2) compute inverse FFT on uniform fine grid + 3) spread (dir=2, ie interpolate) data to regular mesh + The kernel coeffs are precomputed in what is called step 0 in the code. Written with FFTW style complex arrays. Step 0 internally uses CPX, and Step 1 internally uses real arithmetic and FFTW style complex. TYPE 3: - The type 3 algorithm is basically a type 2 (which is implemented precisely - as call to type 2) replacing the middle FFT (Step 2) of a type 1. - Beyond this, the new twists are: - i) nf1, number of upsampled points for the type-1, depends on the product - of interval widths containing input and output points (X*S). - ii) The deconvolve (post-amplify) step is division by the Fourier transform - of the scaled kernel, evaluated on the *nonuniform* output frequency - grid; this is done by direct approximation of the Fourier integral - using quadrature of the kernel function times exponentials. - iii) Shifts in x (real) and s (Fourier) are done to minimize the interval - half-widths X and S, hence nf1. + The type 3 algorithm is basically a type 2 (which is implemented precisely + as call to type 2) replacing the middle FFT (Step 2) of a type 1. + Beyond this, the new twists are: + i) nf1, number of upsampled points for the type-1, depends on the product + of interval widths containing input and output points (X*S). + ii) The deconvolve (post-amplify) step is division by the Fourier transform + of the scaled kernel, evaluated on the *nonuniform* output frequency + grid; this is done by direct approximation of the Fourier integral + using quadrature of the kernel function times exponentials. + iii) Shifts in x (real) and s (Fourier) are done to minimize the interval + half-widths X and S, hence nf1. No references to FFTW are needed here. CPX arithmetic is used. MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): - maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so - this is good only for small problems. + maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so + this is good only for small problems. Design notes for guru interface implementation: @@ -111,10 +111,10 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI *nf = next235even(*nf); // expensive at huge nf return 0; } else { - fprintf( - stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a malloc\n", - __func__, (double)*nf, (double)MAX_NF); + fprintf(stderr, + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " + "malloc\n", + __func__, (double)*nf, (double)MAX_NF); return FINUFFT_ERR_MAXNALLOC; } } @@ -198,8 +198,8 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) Outputs: fwkerhalf - real Fourier series coeffs from indices 0 to nf/2 inclusive, - divided by h = 2pi/n. - (should be allocated for at least nf/2+1 FLTs) + divided by h = 2pi/n. + (should be allocated for at least nf/2+1 FLTs) Compare onedim_dct_kernel which has same interface, but computes DFT of sampled kernel, not quite the same object. @@ -253,8 +253,8 @@ void onedim_nuft_kernel(BIGINT nk, FLT *k, FLT *phihat, finufft_spread_opts opts Inputs: nk - number of freqs k - frequencies, dual to the kernel's natural argument, ie exp(i.k.z) - Note, z is in grid-point units, and k values must be in [-pi, pi) for - accuracy. + Note, z is in grid-point units, and k values must be in [-pi, pi) for + accuracy. opts - spreading opts object, needed to eval kernel (must be already set up) Outputs: @@ -291,11 +291,11 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (from -N/2 up to N/2-1) - 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). + 1: use FFT-style (from 0 to N/2-1, then -N/2 up to -1). fk is size-ms FLT complex array (2*ms FLTs alternating re,im parts) fw is a FFTW style complex array, ie FLT [nf1][2], essentially FLTs - alternating re,im parts. + alternating re,im parts. ker is real-valued FLT array of length nf1/2+1. Single thread only, but shouldn't matter since mostly data movement. @@ -305,7 +305,7 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI This could be removed by passing in an inverse kernel and doing mults. todo: rewrite w/ C++-complex I/O, check complex divide not slower than - real divide, or is there a way to force a real divide? + real divide, or is there a way to force a real divide? Barnett 1/25/17. Fixed ms=0 case 3/14/17. modeord flag & clean 10/25/17 */ @@ -328,7 +328,8 @@ void deconvolveshuffle1d(int dir, FLT prefac, FLT *ker, BIGINT ms, FLT *fk, BIGI fk[pn++] = prefac * fw[nf1 + k][1] / ker[-k]; // im } } else { // read fk, write out to fw w/ zero padding... - for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where needed + for (BIGINT k = kmax + 1; k < nf1 + kmin; ++k) { // zero pad precisely where + // needed fw[k][0] = fw[k][1] = 0.0; } for (BIGINT k = 0; k <= kmax; ++k) { // non-neg freqs k @@ -351,14 +352,14 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) + 1: use FFT-style (pos then negative, on each dim) fk is complex array stored as 2*ms*mt FLTs alternating re,im parts, with - ms looped over fast and mt slow. + ms looped over fast and mt slow. fw is a FFTW style complex array, ie FLT [nf1*nf2][2], essentially FLTs - alternating re,im parts; again nf1 is fast and nf2 slow. + alternating re,im parts; again nf1 is fast and nf2 slow. ker1, ker2 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1 - respectively. + respectively. Barnett 2/1/17, Fixed mt=0 case 3/14/17. modeord 10/25/17 */ @@ -372,7 +373,8 @@ void deconvolveshuffle2d(int dir, FLT prefac, FLT *ker1, FLT *ker2, BIGINT ms, B pn = 2 * (k2max + 1) * ms; } // or, instead, FFT ordering if (dir == 2) // zero pad needed x-lines (contiguous in memory) - for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all dims + for (BIGINT j = nf1 * (k2max + 1); j < nf1 * (nf2 + k2min); ++j) // sweeps all + // dims fw[j][0] = fw[j][1] = 0.0; for (BIGINT k2 = 0; k2 <= k2max; ++k2, pp += 2 * ms) // non-neg y-freqs // point fk and fw to the start of this y value's row (2* is for complex): @@ -393,14 +395,14 @@ void deconvolveshuffle3d(int dir, FLT prefac, FLT *ker1, FLT *ker2, FLT *ker3, B if dir==2: copies fk to fw (and zero pads rest of it), same amplification. modeord=0: use CMCL-compatible mode ordering in fk (each dim increasing) - 1: use FFT-style (pos then negative, on each dim) + 1: use FFT-style (pos then negative, on each dim) fk is complex array stored as 2*ms*mt*mu FLTs alternating re,im parts, with - ms looped over fastest and mu slowest. + ms looped over fastest and mu slowest. fw is a FFTW style complex array, ie FLT [nf1*nf2*nf3][2], effectively - FLTs alternating re,im parts; again nf1 is fastest and nf3 slowest. + FLTs alternating re,im parts; again nf1 is fastest and nf3 slowest. ker1, ker2, ker3 are real-valued FLT arrays of lengths nf1/2+1, nf2/2+1, - and nf3/2+1 respectively. + and nf3/2+1 respectively. Barnett 2/1/17, Fixed mu=0 case 3/14/17. modeord 10/25/17 */ @@ -437,7 +439,7 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX *cBatch) Returns 0 (no error reporting for now). Notes: 1) cBatch is already assumed to have the correct offset, ie here we - read from the start of cBatch (unlike Malleo). fwBatch also has zero offset + read from the start of cBatch (unlike Malleo). fwBatch also has zero offset 2) this routine is a batched version of spreadinterpSorted in spreadinterp.cpp Barnett 5/19/20, based on Malleo 2019. */ @@ -602,7 +604,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, p->tol = tol; p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - // choose overall # threads... + // choose overall # threads... #ifdef _OPENMP int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); int nthr = ompmaxnthr; // default: use as many as OMP gives us @@ -653,8 +655,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, if (type == 3) // could move to setpts, more known? p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, typ - // tol, 12-core xeon + (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon p->opts.upsampfac = 1.25; } if (p->opts.debug > 1) @@ -723,8 +725,9 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, } if (p->opts.debug) { // "long long" here is to avoid warnings with printf... - printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) (nf1,nf2,nf3)=(%lld,%lld,%lld)\n " - " ntrans=%d nthr=%d batchSize=%d ", + printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " + "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " + "batchSize=%d ", __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, p->batchSize); @@ -768,16 +771,16 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT *n_modes, int iflag, int ntrans, timer.restart(); // plan the FFTW int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, - // ot, onembed, ostride, odist, sign, flags + // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, + // idist, ot, onembed, ostride, odist, sign, flags { std::lock_guard lock(fftw_lock); // FFTW_PLAN_TH sets all future fftw_plan calls to use nthr_fft threads. - // FIXME: Since this might override what the user wants for fftw, we'd like to set - // it just for our one plan and then revert to the user value. Unfortunately - // fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and there isn't a - // convenient mechanism to probe the version + // FIXME: Since this might override what the user wants for fftw, we'd like to + // set it just for our one plan and then revert to the user value. + // Unfortunately fftw_planner_nthreads wasn't introduced until fftw 3.3.9, and + // there isn't a convenient mechanism to probe the version FFTW_PLAN_TH(nthr_fft); p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, @@ -839,9 +842,9 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT if (ier) // no warnings allowed here return ier; timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls - // causing memory leak. We don't know it is the same size as before, so we have to - // malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. if (p->sortIndices) free(p->sortIndices); p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { @@ -966,7 +969,8 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT FLT phase = p->t3P.D1 * xj[j]; if (d > 1) phase += p->t3P.D2 * yj[j]; if (d > 2) phase += p->t3P.D3 * zj[j]; - p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler e^{+-i.phase} + p->prephase[j] = cos(phase) + imasign * sin(phase); // Euler + // e^{+-i.phase} } } else for (BIGINT j = 0; j < nj; ++j) @@ -977,9 +981,11 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT for (BIGINT k = 0; k < nk; ++k) { p->Sp[k] = p->t3P.h1 * p->t3P.gam1 * (s[k] - p->t3P.D1); // so |s'_k| < pi/R if (d > 1) - p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < pi/R + p->Tp[k] = p->t3P.h2 * p->t3P.gam2 * (t[k] - p->t3P.D2); // so |t'_k| < + // pi/R if (d > 2) - p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < pi/R + p->Up[k] = p->t3P.h3 * p->t3P.gam3 * (u[k] - p->t3P.D3); // so |u'_k| < + // pi/R } // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... @@ -1024,9 +1030,9 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT // Set up sort for spreading Cp (from primed NU src pts X, Y, Z) to fw... timer.restart(); - // Free sortIndices if it has been allocated before in case of repeated setpts calls - // causing memory leak. We don't know it is the same size as before, so we have to - // malloc each time. + // Free sortIndices if it has been allocated before in case of repeated setpts + // calls causing memory leak. We don't know it is the same size as before, so we + // have to malloc each time. if (p->sortIndices) free(p->sortIndices); p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT) * p->nj); if (!p->sortIndices) { @@ -1073,16 +1079,16 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT *xj, FLT *yj, FLT *zj, BIGINT int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { /* See ../docs/cguru.doc for current documentation. - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. - */ + For given (stack of) weights cj or coefficients fk, performs NUFFTs with + existing (sorted) NU pts and existing plan. + For type 1 and 3: cj is input, fk is output. + For type 2: fk is input, cj is output. + Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate + for each of the 3 types. + For cases of ntrans>1, performs work in blocks of size up to batchSize. + Return value 0 (no error diagnosis yet). + Barnett 5/20/20, based on Malleo 2019. +*/ CNTime timer; timer.start(); @@ -1148,7 +1154,8 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { // for (BIGINT j=0;j<10;++j) printf("\tcj[%ld]=%.15g+%.15gi\n",(long // int)j,(double)real(cj[j]),(double)imag(cj[j])); // debug - double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, t_deconv = 0.0; // accumulated timings + double t_pre = 0.0, t_spr = 0.0, t_t2 = 0.0, + t_deconv = 0.0; // accumulated timings if (p->opts.debug) printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n", __func__, p->ntrans, p->nbatch, p->batchSize); @@ -1180,14 +1187,15 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX *cj, CPX *fk) { t_spr += timer.elapsedsec(); // for (int j=0;jnf1;++j) - // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // debug + // printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]); // + // debug // STEP 2: type 2 NUFFT from fw batch to user output fk array batch... timer.restart(); // illegal possible shrink of ntrans *after* plan for smaller last batch: p->innerT2plan->ntrans = thisBatchSize; // do not try this at home! /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array - still the same size, as Andrea explained; just wastes a few flops) */ + still the same size, as Andrea explained; just wastes a few flops) */ FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX *)(p->fwBatch)); t_t2 += timer.elapsedsec(); From 79c3ea93f1492a896ca138295b5c8dc5cc325a73 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 6 Jun 2024 18:19:24 -0400 Subject: [PATCH 04/35] restored untouched files --- src/simpleinterfaces.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/simpleinterfaces.cpp b/src/simpleinterfaces.cpp index 8e55eab4a..edd25adfb 100644 --- a/src/simpleinterfaces.cpp +++ b/src/simpleinterfaces.cpp @@ -31,13 +31,7 @@ int invokeGuruInterface(int n_dims, int type, int n_transf, BIGINT nj, FLT *xj, { FINUFFT_PLAN plan; int ier = FINUFFT_MAKEPLAN(type, n_dims, n_modes, iflag, n_transf, eps, &plan, - popts); // popts - // (ptr - // to - // opts) - // can - // be - // NULL + popts); // popts (ptr to opts) can be NULL if (ier > 1) { // since 1 (a warning) still allows proceeding... fprintf(stderr, "FINUFFT invokeGuru: plan error (ier=%d)!\n", ier); delete plan; From 4af4d1701c5aacd365b344616e4f49c71d75dde5 Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Thu, 6 Jun 2024 23:05:29 -0400 Subject: [PATCH 05/35] drafted makefile --- makefile | 53 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 46 insertions(+), 7 deletions(-) diff --git a/makefile b/makefile index 8db73a45e..d6d6fdf62 100644 --- a/makefile +++ b/makefile @@ -49,6 +49,15 @@ MKOCTFILE = mkoctfile OFLAGS = # For experts only, location of MWrap executable (see docs/install.rst): MWRAP = mwrap + +# depenency root +DEPS_ROOT := deps + +# xsimd repo url +XSIMD_URL := https://github.com/xtensor-stack/xsimd.git +XSIMD_VERSION := 13.0.0 +XSIMD_DIR := $(DEPS_ROOT)/xsimd + # absolute path of this makefile, ie FINUFFT's top-level directory... FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -60,7 +69,7 @@ FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) # -fPIC (position-indep code) needed to build dyn lib (.so) # Also, we force return (via :=) to the land of simply-expanded variables... INCL = -Iinclude -CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++14 +CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++17 CFLAGS := $(CFLAGS) $(INCL) -fPIC # here /usr/include needed for fftw3.f "fortran header"... (JiriK: no longer) FFLAGS := $(FFLAGS) $(INCL) -I/usr/include -fPIC @@ -116,7 +125,7 @@ OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) default: usage -all: test perftest lib examples fortran matlab octave python +all: test perftest lib examples fortran matlab octave python setup usage: @echo "Makefile for FINUFFT library. Please specify your task:" @@ -165,14 +174,14 @@ src/spreadinterp.o: src/ker_horner_allw_loop.c src/ker_lowupsampfac_horner_allw_ # lib ----------------------------------------------------------------------- # build library with double/single prec both bundled in... lib: $(STATICLIB) $(DYNLIB) -$(STATICLIB): $(OBJSD) +$(STATICLIB): setup $(OBJSD) ar rcs $(STATICLIB) $(OBJSD) ifeq ($(OMP),OFF) @echo "$(STATICLIB) built, single-thread version" else @echo "$(STATICLIB) built, multithreaded version" endif -$(DYNLIB): $(OBJSD) +$(DYNLIB): setup $(OBJSD) # using *absolute* path in the -o here is needed to make portable executables # when compiled against it, in mac OSX, strangely... $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBSFFT) @@ -190,7 +199,7 @@ endif # examples (C++/C) ----------------------------------------------------------- # build all examples (single-prec codes separate, and not all have one)... EXAMPLES = $(basename $(wildcard examples/*.c examples/*.cpp)) -examples: $(EXAMPLES) +examples: setup $(EXAMPLES) ifneq ($(MINGW),ON) # Windows-MSYS does not find the dynamic libraries, so we make a temporary copy # Windows-MSYS has same commands as Linux/OSX @@ -406,7 +415,35 @@ wheel: $(STATICLIB) $(DYNLIB) docker-wheel: docker run --rm -e package_name=finufft -v `pwd`:/io libinlu/manylinux2010_x86_64_fftw /io/python/ci/build-wheels.sh - +# =============================== SETUP ==================================== + +define clone_repo + @echo "Cloning repository $(1) at tag $(2) into directory $(3)" + @if [ ! -d "$(3)" ]; then \ + git clone --branch $(2) $(1) $(3); \ + else \ + cd $(3) && \ + CURRENT_VERSION=$$(git describe --tags --abbrev=0) && \ + if [ "$$CURRENT_VERSION" = "$(2)" ]; then \ + echo "Directory $(3) already exists and is at the correct version $(2)."; \ + else \ + echo "Directory $(3) exists but is at version $$CURRENT_VERSION. Checking out the correct version $(2)."; \ + git fetch --tags && \ + git checkout $(2) || { echo "Error: Failed to checkout version $(2) in $(3)."; exit 1; }; \ + fi; \ + fi +endef + +setup: + @echo "Downloading dependencies..." + @echo "Downloading xsimd..." + mkdir -p $(DEPS_ROOT) + $(call clone_repo,$(XSIMD_URL),$(XSIMD_VERSION),$(XSIMD_DIR)) + @echo "xsimd downloaded in deps/xsimd" + CXXFLAGS += -I$(XSIMD_DIR)/include + +setupclean: + rm -rf $(DEPS_ROOT) # =============================== DOCUMENTATION ============================= @@ -420,7 +457,7 @@ docs/matlabhelp.doc: docs/genmatlabhelp.sh matlab/*.sh matlab/*.docsrc matlab/*. # =============================== CLEAN UP ================================== -clean: objclean pyclean +clean: objclean pyclean setupclean ifneq ($(MINGW),ON) # non-Windows-WSL clean up... rm -f $(STATICLIB) $(DYNLIB) @@ -440,6 +477,7 @@ else del examples\core, test\core, perftest\core, $(subst /,\, $(FE_DIR))\core endif + # indiscriminate .o killer; needed before changing threading... objclean: ifneq ($(MINGW),ON) @@ -473,3 +511,4 @@ else # Windows-WSL... del matlab\finufft_plan.m matlab\finufft.cpp matlab\finufft.mex* endif + From 5d9acdfd2369fb6ee7b8f7edf6b5eb04a6c9c13b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 7 Jun 2024 11:02:36 -0400 Subject: [PATCH 06/35] makefile pulls xsimd --- makefile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/makefile b/makefile index d6d6fdf62..709b5cf3c 100644 --- a/makefile +++ b/makefile @@ -142,6 +142,7 @@ usage: @echo " make spreadtestall - small set spreader-only tests for CI use" @echo " make objclean - remove all object files, preserving libs & MEX" @echo " make clean - also remove all lib, MEX, py, and demo executables" + @echo " make setup - download dependencies" @echo "For faster (multicore) making, append, for example, -j8" @echo "" @echo "Make options:" @@ -154,9 +155,9 @@ usage: HEADERS = $(wildcard include/*.h include/finufft/*.h) # implicit rules for objects (note -o ensures writes to correct dir) -%.o: %.cpp $(HEADERS) +%.o: %.cpp $(HEADERS) setup $(CXX) -c $(CXXFLAGS) $< -o $@ -%_32.o: %.cpp $(HEADERS) +%_32.o: %.cpp $(HEADERS) setup $(CXX) -DSINGLE -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ @@ -174,14 +175,14 @@ src/spreadinterp.o: src/ker_horner_allw_loop.c src/ker_lowupsampfac_horner_allw_ # lib ----------------------------------------------------------------------- # build library with double/single prec both bundled in... lib: $(STATICLIB) $(DYNLIB) -$(STATICLIB): setup $(OBJSD) +$(STATICLIB): $(OBJSD) ar rcs $(STATICLIB) $(OBJSD) ifeq ($(OMP),OFF) @echo "$(STATICLIB) built, single-thread version" else @echo "$(STATICLIB) built, multithreaded version" endif -$(DYNLIB): setup $(OBJSD) +$(DYNLIB): $(OBJSD) # using *absolute* path in the -o here is needed to make portable executables # when compiled against it, in mac OSX, strangely... $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBSFFT) @@ -199,7 +200,7 @@ endif # examples (C++/C) ----------------------------------------------------------- # build all examples (single-prec codes separate, and not all have one)... EXAMPLES = $(basename $(wildcard examples/*.c examples/*.cpp)) -examples: setup $(EXAMPLES) +examples: $(EXAMPLES) ifneq ($(MINGW),ON) # Windows-MSYS does not find the dynamic libraries, so we make a temporary copy # Windows-MSYS has same commands as Linux/OSX From 311d18eef594126ee77ad0b63fc5ddbbe543aab8 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 7 Jun 2024 12:20:30 -0400 Subject: [PATCH 07/35] using uint8_t to be compatible with MSVC --- .github/workflows/C++.yml | 1 - .github/workflows/python_wheel.yml | 3 +++ src/spreadinterp.cpp | 12 ++++++------ 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/.github/workflows/C++.yml b/.github/workflows/C++.yml index 6c16c9507..d93ac3720 100644 --- a/.github/workflows/C++.yml +++ b/.github/workflows/C++.yml @@ -74,7 +74,6 @@ jobs: steps: - uses: actions/checkout@v4 - - name: 'Setup MSYS2' uses: msys2/setup-msys2@v2 with: diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index 282fc93fa..688589ab0 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -186,6 +186,9 @@ jobs: steps: - uses: actions/checkout@v4 + - uses: git-for-windows/setup-git-for-windows-sdk + with: + flavor: minimal - name: Install GCC and make run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw" diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 86566403e..61533a018 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1254,7 +1254,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( static_assert(batches > 0, "batches must be greater than 0"); batch_t ker1val_batches[batches]; - for (u_int8_t i = 0; i < (batches & ~1); i += 2) { + for (uint8_t i = 0; i < (batches & ~1); i += 2) { const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); const auto ker00 = xsimd::swizzle(ker01, zip_low_index); const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); @@ -1272,7 +1272,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const auto j = size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid auto *FINUFFT_RESTRICT trg = du + 2 * j; const batch_t kerval_batch(ker2[dy]); - for (u_int8_t i = 0; i < batches; ++i) { + for (uint8_t i = 0; i < batches; ++i) { const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); result.store_unaligned(trg + i * avx_size); @@ -1366,7 +1366,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( // we need to handle the last batch separately // to the & ~1 is to ensure that we do not iterate over the last batch if it is odd // as it sets the last bit to 0 - for (u_int8_t i = 0; i < (batches & ~1); i += 2) { + for (uint8_t i = 0; i < (batches & ~1); i += 2) { const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); const auto ker00 = xsimd::swizzle(ker01, zip_low_index); const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); @@ -1383,14 +1383,14 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( ker1val_batches[batches - 1] = res; } // critical inner loop: - for (u_int8_t dz{0}; dz < ns; ++dz) { + for (uint8_t dz{0}; dz < ns; ++dz) { const auto oz = size1 * size2 * (i3 - off3 + dz); // offset due to z - for (u_int8_t dy{0}; dy < ns; ++dy) { + for (uint8_t dy{0}; dy < ns; ++dy) { const auto j = oz + size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid auto *FINUFFT_RESTRICT trg = du + 2 * j; const auto kerval = ker2[dy] * ker3[dz]; const batch_t kerval_batch(kerval); - for (u_int8_t i{0}; i < batches; ++i) { + for (uint8_t i{0}; i < batches; ++i) { const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); result.store_unaligned(trg + i * avx_size); From 1409c3bde40025d734154002d126fb5c86b8b229 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 7 Jun 2024 14:56:42 -0400 Subject: [PATCH 08/35] refactored spreader function --- devel/padding.cpp | 2 +- src/ker_horner_allw_loop_constexpr.h | 7 +- src/spreadinterp.cpp | 325 +++++++++++++++------------ 3 files changed, 184 insertions(+), 150 deletions(-) diff --git a/devel/padding.cpp b/devel/padding.cpp index 844a4db17..e8b337bfd 100644 --- a/devel/padding.cpp +++ b/devel/padding.cpp @@ -74,7 +74,7 @@ template static constexpr auto GetPaddedSIMDSize() { template static constexpr auto get_padding() { constexpr uint16_t width = GetPaddedSIMDSize(); - return ns % width == 0 ? 0 : width - (ns % width); + return ((ns + width - 1) & (-width)) - ns; } template diff --git a/src/ker_horner_allw_loop_constexpr.h b/src/ker_horner_allw_loop_constexpr.h index 25a791ddb..a29fea019 100644 --- a/src/ker_horner_allw_loop_constexpr.h +++ b/src/ker_horner_allw_loop_constexpr.h @@ -3,8 +3,11 @@ // (C) The Simons Foundation, Inc. #include -template -constexpr std::array, nc> get_horner_coeffs() noexcept { +template constexpr auto nc200() noexcept { return w + 2 + (w <= 8); } + +template +constexpr std::array, nc200()> get_horner_coeffs_200() noexcept { + constexpr auto nc = nc200(); if constexpr (w == 2) { return std::array, nc>{ {{4.5147043243215315E+01, 4.5147043243215300E+01}, diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 61533a018..0ee348d3f 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -25,16 +25,16 @@ namespace { // anonymous namespace for internal structs equivalent to declaring struct zip_low; struct zip_hi; // forward declaration to clean up the code and be able to use this everywhere in the file -template static constexpr auto BestSIMDHelper(); -template constexpr auto GetPaddedSIMDSize(); -template +template static constexpr auto BestSIMDHelper(); +template constexpr auto GetPaddedSIMDSize(); +template using PaddedSIMD = typename xsimd::make_sized_batch()>::type; -template uint16_t get_padding(uint8_t ns); +template uint8_t get_padding(uint8_t ns); template constexpr auto get_padding(); -template +template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; -template constexpr uint16_t min_batch_size(); -template constexpr auto find_optimal_batch_size(); +template constexpr uint8_t min_batch_size(); +template constexpr auto find_optimal_batch_size(); template constexpr auto initialize_complex_batch(V a, V b) noexcept; template @@ -43,6 +43,13 @@ constexpr auto zip_low_index = template constexpr auto zip_hi_index = xsimd::make_batch_constant, arch_t, zip_hi>(); +template +constexpr std::array, N> pad_2D_array_with_zeros( + const std::array, N> &input) noexcept; +FINUFFT_NEVER_INLINE +void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, + BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, + BIGINT M0); } // namespace // declarations of purely internal functions... (thus need not be in .h) template()>> // aka ns +template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; static void interp_line(FLT *out, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns); @@ -323,12 +330,10 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat if (opts.debug) printf("\tspread %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); - for (BIGINT i = 0; i < 2 * N; i++) // zero the output array. std::fill is no faster - data_uniform[i] = 0.0; + std::fill(data_uniform, data_uniform + 2 * N, 0.0); // zero the output array if (opts.debug) printf("\tzero output array\t%.3g s\n", timer.elapsedsec()); - if (M == 0) // no NU pts, we're done + if (M == 0) // no NU pts, we're done return 0; int spread_single = (nthr == 1) || (M * 100 < N); // low-density heuristic? @@ -340,11 +345,10 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // ... (question is: will the index wrapping per NU pt slow it down?) } if (opts.debug) printf("\tt1 simple spreading:\t%.3g s\n", timer.elapsedsec()); - } else { // ------- Fancy multi-core blocked t1 spreading ---- // Splits sorted inds (jfm's advanced2), could double RAM. // choose nb (# subprobs) via used nthreads: - int nb = min((BIGINT)nthr, M); // simply split one subprob per thr... + auto nb = min((BIGINT)nthr, M); // simply split one subprob per thr... if (nb * (BIGINT)opts.max_subproblem_size < M) { // ...or more subprobs to cap size nb = 1 + (M - 1) / opts.max_subproblem_size; // int div does // ceil(M/opts.max_subproblem_size) @@ -363,84 +367,70 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems - for (int p = 0; p <= nb; ++p) brk[p] = (BIGINT)(0.5 + M * p / (double)nb); - -#pragma omp parallel for num_threads(nthr) schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem - // copy the location and data vectors for the nonuniform points - FLT *kx0 = (FLT *)malloc(sizeof(FLT) * M0), *ky0 = NULL, *kz0 = NULL; - if (N2 > 1) ky0 = (FLT *)malloc(sizeof(FLT) * M0); - if (N3 > 1) kz0 = (FLT *)malloc(sizeof(FLT) * M0); - FLT *dd0 = (FLT *)malloc(sizeof(FLT) * M0 * 2); // complex strength data - for (BIGINT j = 0; j < M0; j++) { // todo: can avoid this copying? - BIGINT kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list - kx0[j] = fold_rescale(kx[kk], N1); - if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); - if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); - dd0[j * 2] = data_nonuniform[kk * 2]; // real part - dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part - } - // get the subgrid which will include padding by roughly nspread/2 - BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; // get_subgrid - // sets - get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, kx0, - ky0, kz0, ns, - ndims); // sets offsets and sizes - if (opts.debug > 1) { // verbose - printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); - if (ndims == 1) - printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, - (long long)padded_size1, (long long)M0); - else if (ndims == 2) - printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", - (long long)offset1, (long long)offset2, (long long)padded_size1, - (long long)size2, (long long)M0); - else - printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", - (long long)offset1, (long long)offset2, (long long)offset3, - (long long)padded_size1, (long long)size2, (long long)size3, - (long long)M0); - } - // allocate output data for this subgrid - FLT *du0 = (FLT *)malloc(sizeof(FLT) * 2 * padded_size1 * size2 * size3); // complex - - // Spread to subgrid without need for bounds checking or wrapping - if (!(opts.flags & TF_OMIT_SPREADING)) { - if (ndims == 1) - spread_subproblem_1d(offset1, padded_size1, du0, M0, kx0, dd0, opts); - else if (ndims == 2) - spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0, M0, kx0, ky0, - dd0, opts); - else - spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, du0, - M0, kx0, ky0, kz0, dd0, opts); - } + for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; - // do the adding of subgrid to output - if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { - if (nthr > opts.atomic_threshold) // see above for debug reporting - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, - size3, N1, N2, N3, data_uniform, du0); // R - // Blackwell's - // atomic - // version - else { +#pragma omp parallel num_threads(nthr) + { + // local copies of NU pts and data for each subproblem + std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; +#pragma omp for schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem + // copy the location and data vectors for the nonuniform points + kx0.resize(M0); + ky0.resize(M0 * (N2 > 1)); + kz0.resize(M0 * (N3 > 1)); + dd0.resize(2 * M0); // complex strength data + for (UBIGINT j = 0; j < M0; j++) { // todo: can avoid this copying? + const UBIGINT kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list + kx0[j] = fold_rescale(kx[kk], N1); + if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); + if (N3 > 1) kz0[j] = fold_rescale(kz[kk], N3); + dd0[j * 2] = data_nonuniform[kk * 2]; // real part + dd0[j * 2 + 1] = data_nonuniform[kk * 2 + 1]; // imag part + } + // get the subgrid which will include padding by roughly nspread/2 + // get_subgrid sets + BIGINT offset1, offset2, offset3, padded_size1, size1, size2, size3; + // sets offsets and sizes + get_subgrid(offset1, offset2, offset3, padded_size1, size1, size2, size3, M0, + kx0.data(), ky0.data(), kz0.data(), ns, ndims); + if (opts.debug > 1) { + print_subgrid_info(ndims, offset1, offset2, offset3, padded_size1, size1, size2, + size3, M0); + } + // allocate output data for this subgrid + du0.resize(2 * padded_size1 * size2 * size3); // complex + // Spread to subgrid without need for bounds checking or wrapping + if (!(opts.flags & TF_OMIT_SPREADING)) { + if (ndims == 1) + spread_subproblem_1d(offset1, padded_size1, du0.data(), M0, kx0.data(), + dd0.data(), opts); + else if (ndims == 2) + spread_subproblem_2d(offset1, offset2, padded_size1, size2, du0.data(), M0, + kx0.data(), ky0.data(), dd0.data(), opts); + else + spread_subproblem_3d(offset1, offset2, offset3, padded_size1, size2, size3, + du0.data(), M0, kx0.data(), ky0.data(), kz0.data(), + dd0.data(), opts); + } + // do the adding of subgrid to output + if (!(opts.flags & TF_OMIT_WRITE_TO_GRID)) { + if (nthr > opts.atomic_threshold) { // see above for debug reporting + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); // R Blackwell's atomic version + } else { #pragma omp critical - add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, - size2, size3, N1, N2, N3, data_uniform, du0); + add_wrapped_subgrid(offset1, offset2, offset3, padded_size1, size1, + size2, size3, N1, N2, N3, data_uniform, + du0.data()); + } } - } - - // free up stuff from this subprob... (that was malloc'ed by hand) - free(dd0); - free(du0); - free(kx0); - if (N2 > 1) free(ky0); - if (N3 > 1) free(kz0); - } // end main loop over subprobs + } // end main loop over subprobs + } if (opts.debug) - printf("\tt1 fancy spread: \t%.3g s (%d subprobs)\n", timer.elapsedsec(), nb); + printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); } // end of choice of which t1 spread type to use return 0; }; @@ -712,64 +702,40 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } -template -constexpr std::array pad_with_zeros(const std::array &input) { - std::array output{0}; - for (auto i = 0; i < N; ++i) { - output[i] = input[i]; - } - return output; -} - -template -constexpr std::array, N> pad_2D_array_with_zeros( - std::array, N> &&input) { - std::array, N> output{}; - for (std::size_t i = 0; i < N; ++i) { - output[i] = pad_with_zeros(input[i]); - } - return output; -} - -template // aka ns +template // aka ns void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, const finufft_spread_opts &opts) noexcept /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. This is the current evaluation method, since it's faster (except i7 w=16). Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ + { const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in // [-1,1] if (opts.upsampfac == 2.0) { // floating point equality is fine here - static constexpr const auto nc = []() { - static_assert(w <= 16, "w must be <= 16"); - static_assert(w >= 2, "w must be >= 2"); - if constexpr (w < 10) { - return w + 3; - } else { - return w + 2; - } - }(); - static constexpr const auto alignment = batch_t::arch_type::alignment(); - static constexpr const auto avx_size = batch_t::size; - static constexpr const auto padded_ns = (w + avx_size - 1) & ~(avx_size - 1); - alignas(alignment) static constexpr const auto ci = - pad_2D_array_with_zeros(get_horner_coeffs()); - alignas(alignment) const std::array zs = [](const FLT z) noexcept { - std::array zs_v{}; - auto sz = z; - for (uint8_t i = 0; i < nc - 1; ++i) { - zs_v[i] = batch_t(sz); - sz *= z; - } - return zs_v; - }(z); + static constexpr auto alignment = batch_t::arch_type::alignment(); + static constexpr auto avx_size = batch_t::size; + static constexpr auto padded_ns = (w + avx_size - 1) & ~(avx_size - 1); + static constexpr auto nc = nc200(); + static constexpr auto horner_coeffs = get_horner_coeffs_200(); + alignas(alignment) static constexpr auto padded_coeffs = + pad_2D_array_with_zeros(horner_coeffs); + alignas(alignment) const std::array pow_z = + [](const FLT z) constexpr noexcept { + std::array zs_v{}; + auto sz = z; + for (uint8_t i = 0; i < nc - 1; ++i) { + zs_v[i] = batch_t(sz); + sz *= z; + } + return zs_v; + }(z); for (uint8_t i = 0; i < w; i += avx_size) { - auto k = batch_t::load_aligned(ci[0].data() + i); + auto k = batch_t::load_aligned(padded_coeffs[0].data() + i); for (uint8_t j = 1; j < nc; ++j) { - const auto cji = batch_t::load_aligned(ci[j].data() + i); - k = xsimd::fma(cji, zs[j - 1], k); + const auto cji = batch_t::load_aligned(padded_coeffs[j].data() + i); + k = xsimd::fma(cji, pow_z[j - 1], k); } k.store_aligned(ker + i); } @@ -1764,6 +1730,24 @@ auto ker_eval(const finufft_spread_opts &opts, const V... elems) noexcept { } namespace { + +template +constexpr array, N> pad_2D_array_with_zeros( + const array, N> &input) noexcept { + constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { + std::array padded{0}; + for (auto i = 0; i < input.size(); ++i) { + padded[i] = input[i]; + } + return padded; + }; + std::array, N> output{}; + for (std::size_t i = 0; i < N; ++i) { + output[i] = pad_with_zeros(input[i]); + } + return output; +} + template constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { // utility function to generate a sequence of a, b interleaved as function arguments @@ -1786,7 +1770,7 @@ constexpr auto initialize_complex_batch(V a, V b) noexcept { // this finds the largest SIMD instruction set that can handle N elements // void otherwise -> compile error -template constexpr auto BestSIMDHelper() { +template constexpr auto BestSIMDHelper() { if constexpr (N % K == 0) { // returns void in the worst case return xsimd::make_sized_batch{}; } else { @@ -1794,7 +1778,9 @@ template constexpr auto BestSIMDHelper() { } } -template constexpr uint16_t min_batch_size() { +template constexpr uint8_t min_batch_size() { + // finds the smallest batch size that can handle N elements + // batch size is the SIMD width in xsimd terminology if constexpr (std::is_void_v>) { return min_batch_size(); } else { @@ -1802,13 +1788,16 @@ template constexpr uint16_t min_batch_size() { } }; -template constexpr auto find_optimal_batch_size() { - uint16_t optimal_batch_size = min_batch_size(); - uint16_t min_iterations = (N + optimal_batch_size - 1) / optimal_batch_size; - for (uint16_t batch_size = optimal_batch_size; +template constexpr auto find_optimal_batch_size() { + // finds the smallest batch size that minimizes the number of iterations + // NOTE: might be suboptimal for some cases 2^N+1 for example + // in the future we might want to implement a more sophisticated algorithm + uint8_t optimal_batch_size = min_batch_size(); + uint8_t min_iterations = (N + optimal_batch_size - 1) / optimal_batch_size; + for (uint8_t batch_size = optimal_batch_size; batch_size <= xsimd::batch::size; batch_size *= 2) { - uint16_t iterations = (N + batch_size - 1) / batch_size; + uint8_t iterations = (N + batch_size - 1) / batch_size; if (iterations < min_iterations) { min_iterations = iterations; optimal_batch_size = batch_size; @@ -1817,16 +1806,27 @@ template constexpr auto find_optimal_batch_size() { return optimal_batch_size; } -template constexpr auto GetPaddedSIMDSize() { +template constexpr auto GetPaddedSIMDSize() { + // helper function to get the SIMD size with padding for the given number of elements + // that minimizes the number of iterations return xsimd::make_sized_batch()>::type::size; } template constexpr auto get_padding() { - constexpr uint16_t width = GetPaddedSIMDSize(); - return ns % width == 0 ? 0 : width - (ns % width); + // helper function to get the padding for the given number of elements + // ns is known at compile time + // rounds ns to the next multiple of the SIMD width + // then subtracts ns to get the padding + constexpr uint8_t width = GetPaddedSIMDSize(); + return ((ns + width - 1) & (-width)) - ns; } -template constexpr auto get_padding_helper(uint16_t runtime_ns) { +template constexpr auto get_padding_helper(uint8_t runtime_ns) { + // helper function to get the padding for the given number of elements where ns is + // known at runtime, it uses recursion to find the padding + // this allows to avoid having a function with a large number of switch cases + // as GetPaddedSIMDSize requires a compile time value + // it cannot be a lambda function because of the template recursion if constexpr (ns < 2) { return 0; } else { @@ -1838,17 +1838,48 @@ template constexpr auto get_padding_helper(uint16_t runtime } } -template uint16_t get_padding(uint8_t ns) { +template uint8_t get_padding(uint8_t ns) { + // return the padding as a function of the number of elements + // 2 * MAX_NSPREAD is the maximum number of elements that we can have + // that's why is hardcoded here return get_padding_helper(ns); } struct zip_low { + // helper struct to get the lower half of a SIMD register and zip it with itself + // it returns index 0, 0, 1, 1, ... N/2, N/2 static constexpr unsigned get(unsigned index, unsigned /*size*/) { return index / 2; } }; struct zip_hi { + // helper struct to get the upper half of a SIMD register and zip it with itself + // it returns index N/2, N/2, N/2+1, N/2+1, ... N, N static constexpr unsigned get(unsigned index, unsigned size) { return (size + index) / 2; } }; + +void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, + BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, + BIGINT M0) { + printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); + switch (ndims) { + case 1: + printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, + (long long)padded_size1, (long long)M0); + break; + case 2: + printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1, + (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); + break; + case 3: + printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", + (long long)offset1, (long long)offset2, (long long)offset3, + (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0); + break; + default: + printf("Invalid number of dimensions: %d\n", ndims); + break; + } +} } // namespace } // namespace finufft::spreadinterp From c8f389a48df7afef4f863b9756c5e10b86c6048d Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 11 Jun 2024 15:04:50 -0400 Subject: [PATCH 09/35] Vectorized single thread binsort --- src/spreadinterp.cpp | 219 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 194 insertions(+), 25 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 0ee348d3f..93847d9bb 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -50,6 +50,7 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, BIGINT M0); +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N); } // namespace // declarations of purely internal functions... (thus need not be in .h) template 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) +// // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, +// // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). +// // Note that round-off near kx=-pi stably rounds negative to i1=0. +// const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); +// const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; +// const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; +// const auto nbins = nbins1 * nbins2 * nbins3; +// const auto inv_bin_size_x = FLT(1.0 / bin_size_x); +// const auto inv_bin_size_y = FLT(1.0 / bin_size_y); +// const auto inv_bin_size_z = FLT(1.0 / bin_size_z); +// // count how many pts in each bin +// std::vector counts(nbins, 0); +// +// for (auto i = 0; i < M; i++) { +// // find the bin index in however many dims are needed +// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); +// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; +// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; +// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); +// ++counts[bin]; +// } +// +// // compute the offsets directly in the counts array (no offset array) +// BIGINT current_offset = 0; +// for (BIGINT i = 0; i < nbins; i++) { +// BIGINT tmp = counts[i]; +// counts[i] = current_offset; // Reinecke's cute replacement of counts[i] +// current_offset += tmp; +// } // (counts now contains the index offsets for each bin) +// +// for (auto i = 0; i < M; i++) { +// // find the bin index (again! but better than using RAM) +// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); +// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; +// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; +// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); +// ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly +// ++counts[bin]; // update the offsets +// } +//} + +void bin_sort_singlethread( + BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, + const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x, + const double bin_size_y, const double bin_size_z, const int debug) /* Returns permutation of all nonuniform points with good RAM access, * ie less cache misses for spreading, in 1D, 2D, or 3D. Single-threaded version * @@ -1503,45 +1580,125 @@ void bin_sort_singlethread(BIGINT *ret, BIGINT M, FLT *kx, FLT *ky, FLT *kz, BIG * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed). */ { + static constexpr auto alignment = xsimd::batch::arch_type::alignment(); const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins1 = UBIGINT(FLT(N1) / bin_size_x + 1); + const auto nbins2 = isky ? UBIGINT(FLT(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? UBIGINT(FLT(N3) / bin_size_z + 1) : 1; + const auto nbins12 = nbins1 * nbins2; const auto nbins = nbins1 * nbins2 * nbins3; const auto inv_bin_size_x = FLT(1.0 / bin_size_x); const auto inv_bin_size_y = FLT(1.0 / bin_size_y); const auto inv_bin_size_z = FLT(1.0 / bin_size_z); + + static constexpr auto avx_width = xsimd::batch::size; + const auto regular_part = M & (-avx_width); + // count how many pts in each bin - std::vector counts(nbins, 0); + std::vector counts(nbins, 0); + + static constexpr auto to_array = [](const auto bins) noexcept { + using contained_t = typename decltype(bins)::value_type; + alignas(alignment) std::array result{}; + bins.store_aligned(result.data()); + return result; + }; + + static constexpr auto to_uint = [](const xsimd::batch bins) noexcept { + return xsimd::batch_cast>(bins); + }; + + const auto compute_bins = [=](auto... args) constexpr noexcept { + std::array k_arr = {args...}; + auto bins = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); + if constexpr (sizeof...(args) > 1) { + const auto i2 = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); + bins = xsimd::fma(decltype(bins)(nbins1), i2, bins); + } + if constexpr (sizeof...(args) > 2) { + const auto i3 = xsimd::floor( + fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); + bins = xsimd::fma(decltype(bins)(nbins12), i3, bins); + } + return to_uint(bins); + }; + + const auto increment_bins = [&counts](const auto bins) constexpr noexcept { + const auto bin_array = to_array(bins); + for (const auto bin : bin_array) { + ++counts[bin]; + } + }; + + const auto accumulate_bins = [&counts, &ret](const auto bins, + const auto i) constexpr noexcept { + const auto bin_array = to_array(bins); + for (uint8_t j{0}; j < avx_width; ++j) { + const auto bin = bin_array[j]; + // fill the inverse map on the fly, careful of indexes errors + ret[counts[bin]] = i + j; + ++counts[bin]; + } + }; - for (auto i = 0; i < M; i++) { + UBIGINT i{0}; + if (iskz) { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i, ky + i, kz + i)); + } + } else if (isky) { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i, ky + i)); + } + } else { + for (; i < regular_part; i += avx_width) { + increment_bins(compute_bins(kx + i)); + } + } + + for (; i < M; ++i) { // find the bin index in however many dims are needed - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } // compute the offsets directly in the counts array (no offset array) - BIGINT current_offset = 0; - for (BIGINT i = 0; i < nbins; i++) { - BIGINT tmp = counts[i]; - counts[i] = current_offset; // Reinecke's cute replacement of counts[i] - current_offset += tmp; + UBIGINT current_offset{0}; // Reinecke's cute replacement of counts[i] + for (i = 0; i < nbins; ++i) { + counts[i] = std::exchange(current_offset, current_offset + counts[i]); } // (counts now contains the index offsets for each bin) - for (auto i = 0; i < M; i++) { + i = 0; // we need to redo the loop so variable should be zeroed here + if (iskz) { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i, ky + i, kz + i), i); + } + } else if (isky) { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i, ky + i), i); + } + } else { + for (; i < regular_part; i += avx_width) { + accumulate_bins(compute_bins(kx + i), i); + } + } + + for (; i < M; ++i) { // find the bin index (again! but better than using RAM) - const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly - ++counts[bin]; // update the offsets + ret[counts[bin]] = i; // fill the inverse map on the fly + ++counts[bin]; // update the offsets } } @@ -1881,5 +2038,17 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset break; } } + +// since xsimd is not constexpr this slows down the loop due to the guard variables +// hence moved them here + +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(const xsimd::batch x, + const BIGINT N) { + const xsimd::batch x2pi{FLT(M_1_2PI)}; + const xsimd::batch half{FLT(0.5)}; + auto result = xsimd::fma(x, x2pi, half); + result -= xsimd::floor(result); + return result * FLT(N); +} } // namespace } // namespace finufft::spreadinterp From 10deea6e4e8ff2655c3ac22ad59f30ad603a3477 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Tue, 11 Jun 2024 20:37:26 -0400 Subject: [PATCH 10/35] using uint instead of floats --- src/spreadinterp.cpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 93847d9bb..0140e54f9 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1612,20 +1612,21 @@ void bin_sort_singlethread( }; const auto compute_bins = [=](auto... args) constexpr noexcept { - std::array k_arr = {args...}; - auto bins = xsimd::floor( - fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); + const std::array k_arr = {args...}; + // + auto bins = + to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); if constexpr (sizeof...(args) > 1) { - const auto i2 = xsimd::floor( - fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); + const auto i2 = + to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); bins = xsimd::fma(decltype(bins)(nbins1), i2, bins); } if constexpr (sizeof...(args) > 2) { - const auto i3 = xsimd::floor( - fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); + const auto i3 = + to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); bins = xsimd::fma(decltype(bins)(nbins12), i3, bins); } - return to_uint(bins); + return bins; }; const auto increment_bins = [&counts](const auto bins) constexpr noexcept { From 740fad11642188e8bc15bc6c8f30cf7c4891b7bc Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 12 Jun 2024 12:16:26 -0400 Subject: [PATCH 11/35] optimized fold ? --- src/spreadinterp.cpp | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 0140e54f9..305261f66 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -50,7 +50,8 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, BIGINT M0); -FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, BIGINT N); +FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, FLT N); +FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x); } // namespace // declarations of purely internal functions... (thus need not be in .h) template::size; const auto regular_part = M & (-avx_width); @@ -1611,22 +1618,25 @@ void bin_sort_singlethread( return xsimd::batch_cast>(bins); }; - const auto compute_bins = [=](auto... args) constexpr noexcept { + const auto compute_bins = [=](const auto... args) constexpr noexcept { const std::array k_arr = {args...}; // - auto bins = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[0]), N1) * inv_bin_size_x); + auto bins0 = to_uint(fold(xsimd::load_unaligned(k_arr[0])) * rescale1); + decltype(bins0) bins1{0u}; + decltype(bins0) bins2{0u}; if constexpr (sizeof...(args) > 1) { - const auto i2 = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[1]), N2) * inv_bin_size_y); - bins = xsimd::fma(decltype(bins)(nbins1), i2, bins); + const auto i2 = to_uint(fold(xsimd::load_unaligned(k_arr[1])) * rescale2); + bins1 = nbins1 * i2; + } else { + return bins0; } if constexpr (sizeof...(args) > 2) { - const auto i3 = - to_uint(fold_rescale_vec(xsimd::load_unaligned(k_arr[2]), N3) * inv_bin_size_z); - bins = xsimd::fma(decltype(bins)(nbins12), i3, bins); + const auto i3 = to_uint(fold(xsimd::load_unaligned(k_arr[2])) * rescale3); + bins2 = nbins12 * i3; + } else { + return bins0 + bins1; } - return bins; + return bins0 + bins1 + bins2; }; const auto increment_bins = [&counts](const auto bins) constexpr noexcept { @@ -2044,12 +2054,19 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset // hence moved them here FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(const xsimd::batch x, - const BIGINT N) { + const FLT N) { + const xsimd::batch x2pi{FLT(M_1_2PI)}; + const xsimd::batch half{FLT(0.5)}; + auto result = xsimd::fma(x, x2pi, half); + result -= xsimd::floor(result); + return result * N; +} +FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x) { const xsimd::batch x2pi{FLT(M_1_2PI)}; const xsimd::batch half{FLT(0.5)}; auto result = xsimd::fma(x, x2pi, half); result -= xsimd::floor(result); - return result * FLT(N); + return result; } } // namespace } // namespace finufft::spreadinterp From 53606d801441e0b37e5433acdc93b8c8c94ec562 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 12 Jun 2024 14:20:42 -0400 Subject: [PATCH 12/35] index-sort-vectorized --- src/spreadinterp.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 305261f66..be8e8e3cd 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1622,8 +1622,8 @@ void bin_sort_singlethread( const std::array k_arr = {args...}; // auto bins0 = to_uint(fold(xsimd::load_unaligned(k_arr[0])) * rescale1); - decltype(bins0) bins1{0u}; - decltype(bins0) bins2{0u}; + auto bins1 = to_uint(FLT(0)); + auto bins2 = to_uint(FLT(0)); if constexpr (sizeof...(args) > 1) { const auto i2 = to_uint(fold(xsimd::load_unaligned(k_arr[1])) * rescale2); bins1 = nbins1 * i2; From 28022fe2c2229cd9dfa37efa66cd69b02a03390b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Wed, 12 Jun 2024 14:26:44 -0400 Subject: [PATCH 13/35] scalar-bin-sort --- src/spreadinterp.cpp | 223 ++++--------------------------------------- 1 file changed, 19 insertions(+), 204 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index be8e8e3cd..12f8e4670 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -50,8 +50,6 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, BIGINT padded_size1, BIGINT size1, BIGINT size2, BIGINT size3, BIGINT M0); -FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(xsimd::batch x, FLT N); -FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x); } // namespace // declarations of purely internal functions... (thus need not be in .h) template 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) -// // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, -// // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). -// // Note that round-off near kx=-pi stably rounds negative to i1=0. -// const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); -// const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; -// const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; -// const auto nbins = nbins1 * nbins2 * nbins3; -// const auto inv_bin_size_x = FLT(1.0 / bin_size_x); -// const auto inv_bin_size_y = FLT(1.0 / bin_size_y); -// const auto inv_bin_size_z = FLT(1.0 / bin_size_z); -// // count how many pts in each bin -// std::vector counts(nbins, 0); -// -// for (auto i = 0; i < M; i++) { -// // find the bin index in however many dims are needed -// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); -// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; -// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; -// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); -// ++counts[bin]; -// } -// -// // compute the offsets directly in the counts array (no offset array) -// BIGINT current_offset = 0; -// for (BIGINT i = 0; i < nbins; i++) { -// BIGINT tmp = counts[i]; -// counts[i] = current_offset; // Reinecke's cute replacement of counts[i] -// current_offset += tmp; -// } // (counts now contains the index offsets for each bin) -// -// for (auto i = 0; i < M; i++) { -// // find the bin index (again! but better than using RAM) -// const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); -// const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; -// const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; -// const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); -// ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly -// ++counts[bin]; // update the offsets -// } -//} - void bin_sort_singlethread( BIGINT *ret, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const BIGINT N1, const BIGINT N2, const BIGINT N3, const double bin_size_x, @@ -1581,135 +1505,45 @@ void bin_sort_singlethread( * Simplified by Martin Reinecke, 6/19/23 (no apparent effect on speed). */ { - static constexpr auto alignment = xsimd::batch::arch_type::alignment(); const auto isky = (N2 > 1), iskz = (N3 > 1); // ky,kz avail? (cannot access if not) // here the +1 is needed to allow round-off error causing i1=N1/bin_size_x, // for kx near +pi, ie foldrescale gives N1 (exact arith would be 0 to N1-1). // Note that round-off near kx=-pi stably rounds negative to i1=0. - const auto nbins1 = UBIGINT(FLT(N1) / bin_size_x + 1); - const auto nbins2 = isky ? UBIGINT(FLT(N2) / bin_size_y + 1) : 1; - const auto nbins3 = iskz ? UBIGINT(FLT(N3) / bin_size_z + 1) : 1; - const auto nbins12 = nbins1 * nbins2; + const auto nbins1 = BIGINT(FLT(N1) / bin_size_x + 1); + const auto nbins2 = isky ? BIGINT(FLT(N2) / bin_size_y + 1) : 1; + const auto nbins3 = iskz ? BIGINT(FLT(N3) / bin_size_z + 1) : 1; const auto nbins = nbins1 * nbins2 * nbins3; const auto inv_bin_size_x = FLT(1.0 / bin_size_x); const auto inv_bin_size_y = FLT(1.0 / bin_size_y); const auto inv_bin_size_z = FLT(1.0 / bin_size_z); - const auto fN1 = FLT(N1); - const auto fN2 = FLT(N2); - const auto fN3 = FLT(N3); - const auto rescale1 = fN1 * inv_bin_size_x; - const auto rescale2 = fN2 * inv_bin_size_y; - const auto rescale3 = fN3 * inv_bin_size_z; - - static constexpr auto avx_width = xsimd::batch::size; - const auto regular_part = M & (-avx_width); - // count how many pts in each bin - std::vector counts(nbins, 0); - - static constexpr auto to_array = [](const auto bins) noexcept { - using contained_t = typename decltype(bins)::value_type; - alignas(alignment) std::array result{}; - bins.store_aligned(result.data()); - return result; - }; - - static constexpr auto to_uint = [](const xsimd::batch bins) noexcept { - return xsimd::batch_cast>(bins); - }; - - const auto compute_bins = [=](const auto... args) constexpr noexcept { - const std::array k_arr = {args...}; - // - auto bins0 = to_uint(fold(xsimd::load_unaligned(k_arr[0])) * rescale1); - auto bins1 = to_uint(FLT(0)); - auto bins2 = to_uint(FLT(0)); - if constexpr (sizeof...(args) > 1) { - const auto i2 = to_uint(fold(xsimd::load_unaligned(k_arr[1])) * rescale2); - bins1 = nbins1 * i2; - } else { - return bins0; - } - if constexpr (sizeof...(args) > 2) { - const auto i3 = to_uint(fold(xsimd::load_unaligned(k_arr[2])) * rescale3); - bins2 = nbins12 * i3; - } else { - return bins0 + bins1; - } - return bins0 + bins1 + bins2; - }; - - const auto increment_bins = [&counts](const auto bins) constexpr noexcept { - const auto bin_array = to_array(bins); - for (const auto bin : bin_array) { - ++counts[bin]; - } - }; - - const auto accumulate_bins = [&counts, &ret](const auto bins, - const auto i) constexpr noexcept { - const auto bin_array = to_array(bins); - for (uint8_t j{0}; j < avx_width; ++j) { - const auto bin = bin_array[j]; - // fill the inverse map on the fly, careful of indexes errors - ret[counts[bin]] = i + j; - ++counts[bin]; - } - }; + std::vector counts(nbins, 0); - UBIGINT i{0}; - if (iskz) { - for (; i < regular_part; i += avx_width) { - increment_bins(compute_bins(kx + i, ky + i, kz + i)); - } - } else if (isky) { - for (; i < regular_part; i += avx_width) { - increment_bins(compute_bins(kx + i, ky + i)); - } - } else { - for (; i < regular_part; i += avx_width) { - increment_bins(compute_bins(kx + i)); - } - } - - for (; i < M; ++i) { + for (auto i = 0; i < M; i++) { // find the bin index in however many dims are needed - const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); ++counts[bin]; } // compute the offsets directly in the counts array (no offset array) - UBIGINT current_offset{0}; // Reinecke's cute replacement of counts[i] - for (i = 0; i < nbins; ++i) { - counts[i] = std::exchange(current_offset, current_offset + counts[i]); + BIGINT current_offset = 0; + for (BIGINT i = 0; i < nbins; i++) { + BIGINT tmp = counts[i]; + counts[i] = current_offset; // Reinecke's cute replacement of counts[i] + current_offset += tmp; } // (counts now contains the index offsets for each bin) - i = 0; // we need to redo the loop so variable should be zeroed here - if (iskz) { - for (; i < regular_part; i += avx_width) { - accumulate_bins(compute_bins(kx + i, ky + i, kz + i), i); - } - } else if (isky) { - for (; i < regular_part; i += avx_width) { - accumulate_bins(compute_bins(kx + i, ky + i), i); - } - } else { - for (; i < regular_part; i += avx_width) { - accumulate_bins(compute_bins(kx + i), i); - } - } - - for (; i < M; ++i) { + for (auto i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) - const auto i1 = UBIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); - const auto i2 = isky ? UBIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; - const auto i3 = iskz ? UBIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; + const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); + const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; + const auto i3 = iskz ? BIGINT(fold_rescale(kz[i], N3) * inv_bin_size_z) : 0; const auto bin = i1 + nbins1 * (i2 + nbins2 * i3); - ret[counts[bin]] = i; // fill the inverse map on the fly - ++counts[bin]; // update the offsets + ret[counts[bin]] = BIGINT(i); // fill the inverse map on the fly + ++counts[bin]; // update the offsets } } @@ -2049,24 +1883,5 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset break; } } - -// since xsimd is not constexpr this slows down the loop due to the guard variables -// hence moved them here - -FINUFFT_ALWAYS_INLINE xsimd::batch fold_rescale_vec(const xsimd::batch x, - const FLT N) { - const xsimd::batch x2pi{FLT(M_1_2PI)}; - const xsimd::batch half{FLT(0.5)}; - auto result = xsimd::fma(x, x2pi, half); - result -= xsimd::floor(result); - return result * N; -} -FINUFFT_ALWAYS_INLINE xsimd::batch fold(xsimd::batch x) { - const xsimd::batch x2pi{FLT(M_1_2PI)}; - const xsimd::batch half{FLT(0.5)}; - auto result = xsimd::fma(x, x2pi, half); - result -= xsimd::floor(result); - return result; -} } // namespace } // namespace finufft::spreadinterp From 2a7753dc7896e4178bf82d09b52919cee53d407c Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 14 Jun 2024 14:55:54 -0400 Subject: [PATCH 14/35] different interleaving method --- src/spreadinterp.cpp | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 12f8e4670..717946a32 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1036,7 +1036,11 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // +-----------------------+ // |re|im|re|im|re|im|re|im| // +-----------------------+ - const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); + // const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); + const auto dd_pt = [dd, i]() constexpr noexcept { + const batch_t ddi{dd[i * 2]}, ddj{dd[i * 2 + 1]}; + return xsimd::zip_lo(ddi, ddj); + }(); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index // FLT(i1) has different semantics and results an extra cast @@ -1205,7 +1209,12 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); + // const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + + // 1]); + const auto dd_pt = [dd, pt]() constexpr noexcept { + const batch_t ddi{dd[pt * 2]}, ddj{dd[pt * 2 + 1]}; + return xsimd::zip_lo(ddi, ddj); + }(); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); @@ -1309,7 +1318,12 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width std::fill(du, du + 2 * size1 * size2 * size3, 0); for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); + // const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + + // 1]); + const auto dd_pt = [dd, pt]() constexpr noexcept { + const batch_t ddi{dd[pt * 2]}, ddj{dd[pt * 2 + 1]}; + return xsimd::zip_lo(ddi, ddj); + }(); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); From b1bdbe1a252426a232e18e0f623c06b8210fcc72 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 17 Jun 2024 12:06:30 -0400 Subject: [PATCH 15/35] Revert "different interleaving method" This reverts commit 2a7753dc7896e4178bf82d09b52919cee53d407c. --- src/spreadinterp.cpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 717946a32..12f8e4670 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1036,11 +1036,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // +-----------------------+ // |re|im|re|im|re|im|re|im| // +-----------------------+ - // const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); - const auto dd_pt = [dd, i]() constexpr noexcept { - const batch_t ddi{dd[i * 2]}, ddj{dd[i * 2 + 1]}; - return xsimd::zip_lo(ddi, ddj); - }(); + const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index // FLT(i1) has different semantics and results an extra cast @@ -1209,12 +1205,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - // const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + - // 1]); - const auto dd_pt = [dd, pt]() constexpr noexcept { - const batch_t ddi{dd[pt * 2]}, ddj{dd[pt * 2 + 1]}; - return xsimd::zip_lo(ddi, ddj); - }(); + const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); @@ -1318,12 +1309,7 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width std::fill(du, du + 2 * size1 * size2 * size3, 0); for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - // const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + - // 1]); - const auto dd_pt = [dd, pt]() constexpr noexcept { - const batch_t ddi{dd[pt * 2]}, ddj{dd[pt * 2 + 1]}; - return xsimd::zip_lo(ddi, ddj); - }(); + const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); From c4cc9a5c826fbe85fee848c6621ff8e5c0eecf5f Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Mon, 17 Jun 2024 12:33:44 -0400 Subject: [PATCH 16/35] Batching the zs allows to not have pipeline stalls when using the broadcast --- src/spreadinterp.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 12f8e4670..b07128d02 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -722,16 +722,18 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ static constexpr auto horner_coeffs = get_horner_coeffs_200(); alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - alignas(alignment) const std::array pow_z = - [](const FLT z) constexpr noexcept { - std::array zs_v{}; - auto sz = z; - for (uint8_t i = 0; i < nc - 1; ++i) { - zs_v[i] = batch_t(sz); - sz *= z; - } - return zs_v; - }(z); + const std::array pow_z = [](const FLT z) constexpr noexcept { + std::array zs{}; + std::array zs_v{}; + zs[0] = z; + for (uint8_t i = 1; i < nc - 1; ++i) { + zs[i] = zs[i - 1] * z; + } + for (uint8_t i = 0; i < nc - 1; ++i) { + zs_v[i] = batch_t::broadcast(zs[i]); + } + return zs_v; + }(z); for (uint8_t i = 0; i < w; i += avx_size) { auto k = batch_t::load_aligned(padded_coeffs[0].data() + i); for (uint8_t j = 1; j < nc; ++j) { From fcfe3dde7826c0f9e835126603ef5b91675a6f73 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Tue, 18 Jun 2024 13:55:34 -0400 Subject: [PATCH 17/35] perftest/compare_spreads.jl with w-sweeps all dims, barplots --- perftest/compare_spreads.jl | 94 +++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 perftest/compare_spreads.jl diff --git a/perftest/compare_spreads.jl b/perftest/compare_spreads.jl new file mode 100644 index 000000000..46a1d8ed4 --- /dev/null +++ b/perftest/compare_spreads.jl @@ -0,0 +1,94 @@ +# compare two spreadtest executables at variety of dim, w{prec}. Barnett 6/17/24 +using Printf +using CairoMakie +using JLD2 # for load/save arrays to file +using UnPack + +fnam = "results/master-vs-svec2_gcc114_5700U_nthr8" # outfile head +# locations of pair of FINUFFT repos to compare... +repo1 = "/home/alex/numerics/finufft" +repo2 = "/home/alex/numerics/nufft/finufft-svec2" + +# run spreadtestnd{f} for a list of tols at one prec +# return spread & interp times as 2-by-ntols +function run_spread(repo,dim,M,N,tols,nthr,prec) + if prec==Float64 + exec = "$repo/perftest/spreadtestnd" + elseif prec==Float32 + exec = "$repo/perftest/spreadtestndf" + else error("prec not known!") + end + times = zeros(2,length(tols)) # spread col 1; interp col 2 + for (i,tol) in enumerate(tols) + nr = 3 # repetitions + sptruns = zeros(nr) + intruns = zeros(nr) + for r=1:nr + c = Cmd(`$exec $dim $M $N $tol`,env=("OMP_NUM_THREADS" => "$nthr",)) + r==1 && println(c) # first run show entire Cmd not just strings + o = read(c,String) # do the cmd (no shell launched, as SGJ likes) + sptruns[r] = parse(Float64,split(split(o,"pts in")[2],"s")[1]) # get first timing (spread) in seconds + intruns[r] = parse(Float64,split(split(o,"pts in")[3],"s")[1]) # get 2nd timing (interp) in seconds + end + times[:,i] = [minimum(sptruns), minimum(intruns)] + end + times +end +#ts = run_spread(repo2,1,1e7,1e6,[1e-2,1e-3,1e-5],1,Float32) # basic test +#println(ts); stop + +# plots (and to PNG) all dims and w{prec} for both directions +function plot_all(fnam,ts,wstr,dims,M,N,nthr) + ntols = length(wstr) + repos = stack([[1,2] for i=1:ntols]) # which repo each run was from + for dir=1:2 + dirstr = ["spread","interp"][dir] + fig = Figure(fontsize=10, size=(1000,500)) # plot all 3 dims + for (i,dim) in enumerate(dims) + thrus = 1e-6 * M ./ ts[dir,i,:,:]' # slice for this dir, dim: interleave repo1, repo2, repo1,... + ax = Axis(fig[1,i], title="$dirstr $(dim)d M=$M N=$N $(nthr)thr") # fnam too long + barplot!(ax, kron(1:ntols, [1,1]), thrus[:], dodge=repos[:], color=repos[:]) + ax.xticks=(1:ntols, wstr) + ax.xlabel="w{prec}"; ax.ylabel=L"throughput ($10^6$ NU pt/s)" + ax.limits=((0,ntols+1),(0,nothing)) + yadd = maximum(thrus[:]) # what height to annotate % at + for j=1:ntols # show % change + text!(j+0.4, yadd, text=@sprintf("%.0f%%",100*(thrus[2,j]/thrus[1,j]-1.0)), rotation=pi/2) + end + end + display(fig) + save("$(fnam)_$(dirstr)_M$(M)_N$(N).png",fig) + end +end + +# main script........................................................................... +nthr = 8; # 1: leave cpu freq at max (4.3GHz); for 8, lower to 2.7GHz since drops to this. +# set freq lim with cpupower-gui +# check with: watch -n 1 sort -nr /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq +dims = 1:3 +M=1e7; N=1e6 +#tolsd = [1e-3,1e-6]; tolsf = [1e-2] # double then single prec tol lists +tolsd = [10.0^-k for k=2:14]; tolsf = [10.0^-k for k=2:5]; +compute = true #false +if compute + ntolsd = length(tolsd); ntolsf = length(tolsf) + ntols = ntolsd+ntolsf + ndims = length(dims) + ts = NaN*zeros(2,ndims,ntols,2) # timings: 1st dim = spread,interp, 4th dim = repo# + for (i,dim) in enumerate(dims) # do expensive runs... + ts[:,i,1:ntolsd,1] = run_spread(repo1,dim,M,N,tolsd,nthr,Float64) + ts[:,i,1:ntolsd,2] = run_spread(repo2,dim,M,N,tolsd,nthr,Float64) + ts[:,i,ntolsd+1:end,1] = run_spread(repo1,dim,M,N,tolsf,nthr,Float32) + ts[:,i,ntolsd+1:end,2] = run_spread(repo2,dim,M,N,tolsf,nthr,Float32) + println(ts[:,i,:,:]) + end + #tolstr = [[@sprintf "%.0e" tol for tol=tolsd]; [@sprintf "%.0ef" tol for tol=tolsf]] + # strings for w (nspread) for plotting... + wstr = [[@sprintf "%d" -log10(tol)+1 for tol=tolsd]; [@sprintf "%df" -log10(tol)+1 for tol=tolsf]] + jldsave("$(fnam).jld2"; fnam,ts,wstr,dims,M,N,tolsd,tolsf,nthr) # save all + plot_all(fnam,ts,wstr,dims,M,N,nthr) +else + f = load("$(fnam).jld2"); # gives a dict + @unpack fnam,ts,wstr,dims,M,N,nthr = f # not very easy way to get dict into globals + plot_all(fnam,ts,wstr,dims,M,N,nthr) +end From c7b2e8e8f3901e647b8872b4bb55492cc8d2d543 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 18 Jun 2024 15:57:57 -0400 Subject: [PATCH 18/35] simd horner --- src/ker_horner_allw_loop_constexpr.h | 1132 +++++--------------------- src/spreadinterp.cpp | 27 +- 2 files changed, 236 insertions(+), 923 deletions(-) diff --git a/src/ker_horner_allw_loop_constexpr.h b/src/ker_horner_allw_loop_constexpr.h index a29fea019..077773038 100644 --- a/src/ker_horner_allw_loop_constexpr.h +++ b/src/ker_horner_allw_loop_constexpr.h @@ -2,915 +2,233 @@ // Authors: Alex Barnett & Ludvig af Klinteberg. // (C) The Simons Foundation, Inc. #include - template constexpr auto nc200() noexcept { return w + 2 + (w <= 8); } template constexpr std::array, nc200()> get_horner_coeffs_200() noexcept { - constexpr auto nc = nc200(); - if constexpr (w == 2) { - return std::array, nc>{ - {{4.5147043243215315E+01, 4.5147043243215300E+01}, - {5.7408070938221300E+01, -5.7408070938221293E+01}, - {-1.8395117920046484E+00, -1.8395117920046560E+00}, - {-2.0382426253182082E+01, 2.0382426253182086E+01}, - {-2.0940804433577420E+00, -2.0940804433577389E+00}}}; - } else if constexpr (w == 3) { - return std::array, nc>{ - {{1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02}, - {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02}, - {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02}, - {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01}, - {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01}, - {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00}}}; - } else if constexpr (w == 4) { - return std::array, nc>{ - {{5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, - 5.4284366850213223E+02}, - {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, - -1.4650917259256937E+03}, - {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, - 1.4186910680718347E+03}, - {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, - -5.1133995502497424E+02}, - {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, - -4.8293622641174061E+01}, - {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, - 7.8386867802392359E+01}, - {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, - -1.0039212571700640E+01}}}; - } else if constexpr (w == 5) { - return std::array, nc>{ - {{9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, - 3.7794697666613283E+04, 9.9223677575398403E+02}, - {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, - -3.7938404259811381E+04, -3.0430174925083829E+03}, - {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, - 7.7501368899498730E+03, 3.6092689177271218E+03}, - {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, - 3.8875294641277369E+03, -1.9990077310495412E+03}, - {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, - -1.5861137916762643E+03, 4.0071733590403909E+02}, - {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, - -1.2316471075214508E+02, 9.1301168206167233E+01}, - {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, - 1.1960590540262307E+02, -5.5339722671223605E+01}, - {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, - -2.2839981872943818E+00, 3.3762488150341459E+00}}}; - } else if constexpr (w == 6) { - return std::array, nc>{ - {{2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, - 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03}, - {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, - -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}, - {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, - -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04}, - {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, - 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03}, - {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, - 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03}, - {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, - -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02}, - {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, - -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02}, - {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, - 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01}, - {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, - -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00}}}; - } else if constexpr (w == 7) { - return std::array, nc>{ - {{3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, - 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, - 3.9948351830642519E+03}, - {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, - -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, - -1.5290160332958067E+04}, - {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, - -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, - 2.4458227486795113E+04}, - {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, - 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, - -2.1166189345866893E+04}, - {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, - 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, - 1.0542795672361213E+04}, - {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, - 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, - -2.7903491906078298E+03}, - {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, - -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, - 1.6069721419533221E+02}, - {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, - 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, - 1.2289277375319763E+02}, - {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, - 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, - -3.2270164900224913E+01}, - {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, - 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, - 1.4761410890866353E-01}}}; - } else if constexpr (w == 8) { - return std::array, nc>{ - {{7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, - 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, - 1.7297637497600049E+06, 7.3898000697447915E+03}, - {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, - 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, - -3.1853145713323941E+06, -3.0719636811267606E+04}, - {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, - -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, - 2.4101183255475126E+06, 5.4488498478251728E+04}, - {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, - -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, - -9.0469037926849339E+05, -5.3926359802542138E+04}, - {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, - 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, - 1.3079802224392109E+05, 3.2444118016247590E+04}, - {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, - 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, - 2.2700360645707628E+04, -1.1864306345505294E+04}, - {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, - -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, - -1.1569135767377924E+04, 2.2812256770903286E+03}, - {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, - -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, - 9.7513976461209836E+02, -8.5503535637013552E+00}, - {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, - 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, - 2.8246898554219217E+02, -1.0230637348345138E+02}, - {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, - 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, - -6.1692257626845532E+01, 1.9200143062947120E+01}, - {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, - -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, - -1.7334408840526812E+00, 3.7894993760636758E-01}}}; - } else if constexpr (w == 9) { - return std::array, nc>{ - {{1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, - 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, - 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04}, - {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, - 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, - -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04}, - {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, - -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, - 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05}, - {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, - -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, - -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05}, - {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, - -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, - -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04}, - {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, - 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, - 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04}, - {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, - 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, - -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04}, - {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, - -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, - -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03}, - {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, - -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, - 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01}, - {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, - 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, - 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01}, - {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, - 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, - -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00}}}; - } else if constexpr (w == 10) { - return std::array, nc>{{{ - 2.2594586605749264E+04, - 1.3595989066786593E+07, - 4.4723032442444897E+08, - 3.3781755837397518E+09, - 8.6836783895849819E+09, - 8.6836783895849762E+09, - 3.3781755837397494E+09, - 4.4723032442444897E+08, - 1.3595989066786474E+07, - 2.2594586605749344E+04, - }, - { - 1.0729981697645642E+05, - 3.0651490267742988E+07, - 5.9387966085130465E+08, - 2.4434902657508330E+09, - 2.0073077861288922E+09, - -2.0073077861288943E+09, - -2.4434902657508330E+09, - -5.9387966085130453E+08, - -3.0651490267742816E+07, - -1.0729981697645638E+05, - }, - { - 2.2340399734184606E+05, - 3.0258214643190462E+07, - 3.1512411458738232E+08, - 4.3618276932319808E+08, - -7.8178848450497293E+08, - -7.8178848450497019E+08, - 4.3618276932319826E+08, - 3.1512411458738232E+08, - 3.0258214643190313E+07, - 2.2340399734184548E+05, - }, - { - 2.6917433004353486E+05, - 1.6875651476661228E+07, - 7.4664745481963441E+07, - -9.5882157211118385E+07, - -2.0622994435532519E+08, - 2.0622994435532743E+08, - 9.5882157211118177E+07, - -7.4664745481963515E+07, - -1.6875651476661161E+07, - -2.6917433004353428E+05, - }, - { - 2.0818422772177903E+05, - 5.6084730690362519E+06, - 1.4435118192351763E+06, - -4.0063869969544649E+07, - 3.2803674392747045E+07, - 3.2803674392746095E+07, - -4.0063869969546899E+07, - 1.4435118192351642E+06, - 5.6084730690362034E+06, - 2.0818422772177853E+05, - }, - { - 1.0781139496011091E+05, - 9.9202615851199068E+05, - -3.3266265543962116E+06, - -4.8557049011479173E+05, - 1.0176155522772279E+07, - -1.0176155522772269E+07, - 4.8557049011678610E+05, - 3.3266265543963453E+06, - -9.9202615851196018E+05, - -1.0781139496011072E+05, - }, - { - 3.7380102688153558E+04, - 1.2716675000355666E+04, - -6.2163527451774501E+05, - 1.4157962667184104E+06, - -8.4419693137680157E+05, - -8.4419693137743860E+05, - 1.4157962667189445E+06, - -6.2163527451771160E+05, - 1.2716675000340010E+04, - 3.7380102688153442E+04, - }, - { - 8.1238936393894646E+03, - -3.4872365530450072E+04, - 2.3913680325196314E+04, - 1.2428850301830019E+05, - -3.2158255329716846E+05, - 3.2158255329951923E+05, - -1.2428850301867779E+05, - -2.3913680325277423E+04, - 3.4872365530457188E+04, - -8.1238936393894255E+03, - }, - { - 7.8515926628982663E+02, - -6.6607899119372642E+03, - 2.0167398338513311E+04, - -2.8951401344519112E+04, - 1.4622828142848679E+04, - 1.4622828143544031E+04, - -2.8951401346900999E+04, - 2.0167398338398041E+04, - -6.6607899119505255E+03, - 7.8515926628967964E+02, - }, - { - -1.0147176570537010E+02, - -3.5304284185385157E+01, - 1.3576976854876134E+03, - -4.3921059353471856E+03, - 7.3232085271125388E+03, - -7.3232085273978546E+03, - 4.3921059367737662E+03, - -1.3576976854043962E+03, - 3.5304284185385157E+01, - 1.0147176570550941E+02, - }, - { - -4.3161545259389186E+01, - 1.5498490981579428E+02, - -3.1771250774232175E+02, - 3.7215448796427023E+02, - -1.7181762832770994E+02, - -1.7181763036843782E+02, - 3.7215448789408123E+02, - -3.1771250773692140E+02, - 1.5498490982186786E+02, - -4.3161545259547800E+01, - }, - { - -4.2916172038214198E+00, - 1.7402146071148604E+01, - -4.7947588069135868E+01, - 9.2697698088029625E+01, - -1.2821427596894478E+02, - 1.2821427705670308E+02, - -9.2697698297776569E+01, - 4.7947588093524907E+01, - -1.7402146074502035E+01, - 4.2916172038452141E+00, - }}}; - } else if constexpr (w == 11) { - return std::array, nc>{{{ - 3.7794653219809625E+04, - 3.4782300224660739E+07, - 1.6188020733727551E+09, - 1.7196758809615005E+10, - 6.3754384857724617E+10, - 9.7196447559193497E+10, - 6.3754384857724617E+10, - 1.7196758809614998E+10, - 1.6188020733727560E+09, - 3.4782300224660769E+07, - 3.7794653219808984E+04, - }, - { - 1.8969206922085886E+05, - 8.4769319065313652E+07, - 2.4230555767723408E+09, - 1.5439732722639101E+10, - 2.7112836839612309E+10, - 2.5609833368650835E-06, - -2.7112836839612328E+10, - -1.5439732722639105E+10, - -2.4230555767723408E+09, - -8.4769319065313682E+07, - -1.8969206922085711E+05, - }, - { - 4.2138380313901440E+05, - 9.2050522922791913E+07, - 1.5259983101266613E+09, - 4.7070559561237173E+09, - -1.2448027572952359E+09, - -1.0161446790279301E+10, - -1.2448027572952316E+09, - 4.7070559561237268E+09, - 1.5259983101266615E+09, - 9.2050522922791913E+07, - 4.2138380313901149E+05, - }, - { - 5.4814313598122005E+05, - 5.8085130777589552E+07, - 4.9484006166551048E+08, - 1.6222124676640952E+08, - -2.0440440381345339E+09, - 9.1416457449079640E-06, - 2.0440440381345336E+09, - -1.6222124676640788E+08, - -4.9484006166551071E+08, - -5.8085130777589560E+07, - -5.4814313598121714E+05, - }, - { - 4.6495183529254980E+05, - 2.3067199578027144E+07, - 6.9832590192482382E+07, - -2.2024799260683522E+08, - -1.2820270942588677E+08, - 5.1017181199129778E+08, - -1.2820270942588474E+08, - -2.2024799260683942E+08, - 6.9832590192482322E+07, - 2.3067199578027155E+07, - 4.6495183529254742E+05, - }, - { - 2.7021781043532980E+05, - 5.6764510325100143E+06, - -5.5650761736748898E+06, - -3.9907385617900200E+07, - 7.2453390663687646E+07, - 1.2300109686762266E-05, - -7.2453390663684472E+07, - 3.9907385617899075E+07, - 5.5650761736749066E+06, - -5.6764510325099993E+06, - -2.7021781043532846E+05, - }, - { - 1.0933249308680627E+05, - 6.9586821127987828E+05, - -3.6860240321937902E+06, - 2.7428169457736355E+06, - 8.3392008440593518E+06, - -1.6402201025046850E+07, - 8.3392008440698013E+06, - 2.7428169457778852E+06, - -3.6860240321937371E+06, - 6.9586821127989423E+05, - 1.0933249308680571E+05, - }, - { - 3.0203516161820498E+04, - -3.6879059542768438E+04, - -4.1141031216788280E+05, - 1.4111389975267777E+06, - -1.5914376635331670E+06, - 9.4095582602103753E-06, - 1.5914376635379130E+06, - -1.4111389975247320E+06, - 4.1141031216776522E+05, - 3.6879059542750314E+04, - -3.0203516161820549E+04, - }, - { - 5.1670143574922731E+03, - -2.8613147115372190E+04, - 4.3560195427081359E+04, - 4.8438679582765450E+04, - -2.5856630639231802E+05, - 3.7994883866738499E+05, - -2.5856630640319458E+05, - 4.8438679579510936E+04, - 4.3560195426766244E+04, - -2.8613147115376054E+04, - 5.1670143574922913E+03, - }, - { - 3.0888018539740131E+02, - -3.7949446187471626E+03, - 1.4313303204988082E+04, - -2.6681600235594462E+04, - 2.3856005166166615E+04, - 8.6424601730164351E-06, - -2.3856005155895236E+04, - 2.6681600234453199E+04, - -1.4313303205083188E+04, - 3.7949446187583080E+03, - -3.0888018539728523E+02, - }, - { - -8.3747489794189363E+01, - 1.1948077479405792E+02, - 4.8528498015072080E+02, - -2.5024391114755094E+03, - 5.3511195318669425E+03, - -6.7655484107390166E+03, - 5.3511195362291774E+03, - -2.5024391131167667E+03, - 4.8528498019392708E+02, - 1.1948077480620087E+02, - -8.3747489794426258E+01, - }, - { - -2.2640047135517630E+01, - 9.0840898563949466E+01, - -2.1597187544386938E+02, - 3.1511229111443720E+02, - -2.4856617998395282E+02, - 6.1683918215190516E-06, - 2.4856618439352349E+02, - -3.1511228757800421E+02, - 2.1597187557069353E+02, - -9.0840898570046704E+01, - 2.2640047135565219E+01, - }, - { - -1.6306382886201207E+00, - 7.3325946591320434E+00, - -2.3241017682854558E+01, - 5.1715494398901185E+01, - -8.2673000279130790E+01, - 9.6489719151212370E+01, - -8.2673010381149226E+01, - 5.1715494328769353E+01, - -2.3241018024860580E+01, - 7.3325946448852415E+00, - -1.6306382886460551E+00, - }}}; - } else if constexpr (w == 12) { - return std::array, nc>{ - {{6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, - 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, - 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, - 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04}, - {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, - 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, - -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, - -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}, - {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, - 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, - -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, - 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}, - {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, - 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, - 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, - -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}, - {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, - -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, - 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, - 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}, - {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, - -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, - -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, - -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}, - {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, - -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, - -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, - -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}, - {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, - 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, - 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, - 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}, - {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, - 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, - 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, - -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}, - {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, - -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, - -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, - -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}, - {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, - -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, - -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, - 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}, - {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, - -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, - 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, - -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}, - {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, - 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, - 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, - -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}, - {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, - 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, - -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, - 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}}}; - } else if constexpr (w == 13) { - return std::array, nc>{ - {{9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, - 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, - 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, - 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, - 9.8715725867496090E+04}, - {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, - 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, - -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, - -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, - -5.4491110456935526E+05}, - {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, - 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, - -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, - 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, - 1.3504711883426069E+06}, - {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, - 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, - 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, - -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, - -1.9937206140846489E+06}, - {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, - 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, - 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, - 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, - 1.9607419630386417E+06}, - {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, - -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, - 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, - 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, - -1.3593773865640305E+06}, - {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, - -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, - -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, - -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, - 6.8417206432039209E+05}, - {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, - -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, - 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, - 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, - -2.5248269397037517E+05}, - {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, - 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, - 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, - 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, - 6.7530100970876316E+04}, - {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, - 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, - 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, - -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, - -1.2421368748961073E+04}, - {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, - -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, - -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, - -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, - 1.2904654687545376E+03}, - {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, - -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, - 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, - 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, - 1.9043622268312891E+01}, - {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, - -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, - 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, - -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, - -3.0093984465884773E+01}, - {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, - 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, - 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, - -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, - 4.3050286009485790E+00}, - {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, - 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, - -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, - 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, - -1.0957333740315604E-01}}}; - } else if constexpr (w == 14) { - return std::array, nc>{ - {{1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, - 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, - 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, - 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, - 4.4723032442444855E+08, 1.5499533202970232E+05}, - {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, - 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, - 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, - -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, - -1.3065352538728662E+09, -8.9188339002979454E+05}, - {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, - 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, - -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, - 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, - 1.7532505043698275E+09, 2.3170473769380399E+06}, - {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, - 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, - -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, - -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, - -1.4278058213962219E+09, -3.6089249230396664E+06}, - {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, - 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, - 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, - -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, - 7.8376718099107552E+08, 3.7733555140852560E+06}, - {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, - -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, - 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, - 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, - -3.0340753492383808E+08, -2.8079157920112377E+06}, - {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, - -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, - -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, - -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, - 8.3513615594416171E+07, 1.5361613559533576E+06}, - {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, - -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, - -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, - -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, - -1.5741723594963137E+07, -6.2759409419590747E+05}, - {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, - 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, - 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, - 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, - 1.7156606891560503E+06, 1.9151404903936724E+05}, - {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, - 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, - 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, - 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, - 2.2565910606306688E+03, -4.2715272622820135E+04}, - {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, - 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, - -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, - -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, - -3.5474227033406372E+04, 6.4806786523010323E+03}, - {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, - -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, - -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, - 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, - 5.5416668533342381E+03, -4.9913632906195977E+02}, - {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, - -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, - 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, - 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, - -1.8970588700495102E+02, -3.3076333168231550E+01}, - {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, - -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, - 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, - -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, - -5.7000699100179844E+01, 1.4394533639240331E+01}, - {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, - 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, - -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, - -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, - 8.5113924808491728E+00, -1.5925952194145006E+00}, - {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, - 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, - -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, - 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, - -1.2876100566420701E-01, -1.5984859433053292E-02}}}; - } else if constexpr (w == 15) { - return std::array, nc>{ - {{2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, - 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, - 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, - 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, - 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05}, - {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, - 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, - 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, - -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, - -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06}, - {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, - 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, - -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, - 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, - 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06}, - {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, - 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, - -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, - 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, - -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06}, - {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, - 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, - -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, - -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, - 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06}, - {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, - 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, - 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, - 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, - -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06}, - {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, - -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, - 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, - 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, - 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06}, - {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, - -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, - -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, - -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, - -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06}, - {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, - -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, - -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, - -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, - -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05}, - {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, - 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, - 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, - 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, - 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05}, - {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, - 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, - 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, - -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, - -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04}, - {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, - 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, - -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, - -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, - -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03}, - {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, - -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, - -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, - 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, - 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02}, - {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, - -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, - 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, - 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, - -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01}, - {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, - 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, - 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, - -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, - -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00}, - {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, - 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, - -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, - -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, - 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01}, - {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, - 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, - -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, - 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, - -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}}}; - } else if constexpr (w == 16) { - return std::array, nc>{ - {{3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, - 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, - 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, - 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, - 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, - 3.6434551345571183E+05}, - {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, - 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, - 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, - -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, - -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, - -2.2576246485480373E+06}, - {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, - 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, - 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, - 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, - 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, - 6.3730995546265030E+06}, - {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, - 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, - -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, - 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, - -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, - -1.0896915393078227E+07}, - {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, - 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, - -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, - -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, - 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, - 1.2655725616100591E+07}, - {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, - 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, - 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, - -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, - -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, - -1.0609303958036322E+07}, - {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, - 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, - 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, - 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, - 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, - 6.6544809363384554E+06}, - {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, - -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, - -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, - 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, - 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, - -3.1906872142825015E+06}, - {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, - -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, - -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, - -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, - -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, - 1.1821527096621762E+06}, - {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, - -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, - -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, - 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, - 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, - -3.3854610744279937E+05}, - {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, - 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, - 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, - 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, - 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, - 7.3893334077307401E+04}, - {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, - 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, - 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, - -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, - -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, - -1.1778892113376129E+04}, - {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, - 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, - -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, - -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, - 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, - 1.2019749667911419E+03}, - {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, - -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, - -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, - 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, - 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, - -3.1189837631106176E+01}, - {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, - -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, - 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, - 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, - -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, - -1.2975319073977776E+01}, - {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, - 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, - 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, - -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, - -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, - 2.3155118723150525E+00}, - {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, - 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, - -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, - -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, - 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, - -1.5401723756214594E-01}, - {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, - 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, - -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, - 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, - -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, - -1.1808834826225629E-02}}}; - } else { - static_assert(w >= 2, "w must be >= 2"); - static_assert(w <= 16, "w must be <= 16"); - return {}; - } + constexpr auto nc = nc200(); + if constexpr (w == 2) { + return std::array, nc> {{ + {-2.0940804433577420E+00, -2.0940804433577389E+00}, + {-2.0382426253182082E+01, 2.0382426253182086E+01}, + {-1.8395117920046484E+00, -1.8395117920046560E+00}, + {5.7408070938221300E+01, -5.7408070938221293E+01}, + {4.5147043243215315E+01, 4.5147043243215300E+01} + }}; + } else if constexpr (w == 3) { + return std::array, nc> {{ + {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00}, + {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01}, + {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01}, + {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02}, + {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02}, + {1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02} + }}; + } else if constexpr (w == 4) { + return std::array, nc> {{ + {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, -1.0039212571700640E+01}, + {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, 7.8386867802392359E+01}, + {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, -4.8293622641174061E+01}, + {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, -5.1133995502497424E+02}, + {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, 1.4186910680718347E+03}, + {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, -1.4650917259256937E+03}, + {5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, 5.4284366850213223E+02} + }}; + } else if constexpr (w == 5) { + return std::array, nc> {{ + {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, -2.2839981872943818E+00, 3.3762488150341459E+00}, + {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, 1.1960590540262307E+02, -5.5339722671223605E+01}, + {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, -1.2316471075214508E+02, 9.1301168206167233E+01}, + {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, -1.5861137916762643E+03, 4.0071733590403909E+02}, + {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, 3.8875294641277369E+03, -1.9990077310495412E+03}, + {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, 7.7501368899498730E+03, 3.6092689177271218E+03}, + {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, -3.7938404259811381E+04, -3.0430174925083829E+03}, + {9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, 3.7794697666613283E+04, 9.9223677575398403E+02} + }}; + } else if constexpr (w == 6) { + return std::array, nc> {{ + {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00}, + {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01}, + {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02}, + {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02}, + {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03}, + {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03}, + {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04}, + {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03}, + {2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03} + }}; + } else if constexpr(w==7) { + return std::array, nc> {{ + {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, 1.4761410890866353E-01}, + {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, -3.2270164900224913E+01}, + {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, 1.2289277375319763E+02}, + {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, 1.6069721419533221E+02}, + {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, -2.7903491906078298E+03}, + {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, 1.0542795672361213E+04}, + {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, -2.1166189345866893E+04}, + {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, 2.4458227486795113E+04}, + {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, -1.5290160332958067E+04}, + {3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, 3.9948351830642519E+03} + }}; + } else if constexpr(w==8) { + return std::array, nc> {{ + {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, -1.7334408840526812E+00, 3.7894993760636758E-01}, + {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, -6.1692257626845532E+01, 1.9200143062947120E+01}, + {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, 2.8246898554219217E+02, -1.0230637348345138E+02}, + {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, 9.7513976461209836E+02, -8.5503535637013552E+00}, + {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, -1.1569135767377924E+04, 2.2812256770903286E+03}, + {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, 2.2700360645707628E+04, -1.1864306345505294E+04}, + {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, 1.3079802224392109E+05, 3.2444118016247590E+04}, + {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, -9.0469037926849339E+05, -5.3926359802542138E+04}, + {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}, + {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267606E+04}, + {7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, 1.7297637497600049E+06, 7.3898000697447915E+03} + }}; + } else if constexpr(w==9) { + return std::array, nc> {{ + {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00}, + {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01}, + {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01}, + {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03}, + {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04}, + {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04}, + {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04}, + {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05}, + {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05}, + {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04}, + {1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04} + }}; + } else if constexpr(w==10) { + return std::array, nc> {{ + {-4.2916172038214198E+00, 1.7402146071148604E+01, -4.7947588069135868E+01, 9.2697698088029625E+01, -1.2821427596894478E+02, 1.2821427705670308E+02, -9.2697698297776569E+01, 4.7947588093524907E+01, -1.7402146074502035E+01, 4.2916172038452141E+00,}, + {-4.3161545259389186E+01, 1.5498490981579428E+02, -3.1771250774232175E+02, 3.7215448796427023E+02, -1.7181762832770994E+02, -1.7181763036843782E+02, 3.7215448789408123E+02, -3.1771250773692140E+02, 1.5498490982186786E+02, -4.3161545259547800E+01,}, + {-1.0147176570537010E+02, -3.5304284185385157E+01, 1.3576976854876134E+03, -4.3921059353471856E+03, 7.3232085271125388E+03, -7.3232085273978546E+03, 4.3921059367737662E+03, -1.3576976854043962E+03, 3.5304284185385157E+01, 1.0147176570550941E+02,}, + {7.8515926628982663E+02, -6.6607899119372642E+03, 2.0167398338513311E+04, -2.8951401344519112E+04, 1.4622828142848679E+04, 1.4622828143544031E+04, -2.8951401346900999E+04, 2.0167398338398041E+04, -6.6607899119505255E+03, 7.8515926628967964E+02,}, + {8.1238936393894646E+03, -3.4872365530450072E+04, 2.3913680325196314E+04, 1.2428850301830019E+05, -3.2158255329716846E+05, 3.2158255329951923E+05, -1.2428850301867779E+05, -2.3913680325277423E+04, 3.4872365530457188E+04, -8.1238936393894255E+03,}, + {3.7380102688153558E+04, 1.2716675000355666E+04, -6.2163527451774501E+05, 1.4157962667184104E+06, -8.4419693137680157E+05, -8.4419693137743860E+05, 1.4157962667189445E+06, -6.2163527451771160E+05, 1.2716675000340010E+04, 3.7380102688153442E+04,}, + {1.0781139496011091E+05, 9.9202615851199068E+05, -3.3266265543962116E+06, -4.8557049011479173E+05, 1.0176155522772279E+07, -1.0176155522772269E+07, 4.8557049011678610E+05, 3.3266265543963453E+06, -9.9202615851196018E+05, -1.0781139496011072E+05,}, + {2.0818422772177903E+05, 5.6084730690362519E+06, 1.4435118192351763E+06, -4.0063869969544649E+07, 3.2803674392747045E+07, 3.2803674392746095E+07, -4.0063869969546899E+07, 1.4435118192351642E+06, 5.6084730690362034E+06, 2.0818422772177853E+05,}, + {2.6917433004353486E+05, 1.6875651476661228E+07, 7.4664745481963441E+07, -9.5882157211118385E+07, -2.0622994435532519E+08, 2.0622994435532743E+08, 9.5882157211118177E+07, -7.4664745481963515E+07, -1.6875651476661161E+07, -2.6917433004353428E+05,}, + {2.2340399734184606E+05, 3.0258214643190462E+07, 3.1512411458738232E+08, 4.3618276932319808E+08, -7.8178848450497293E+08, -7.8178848450497019E+08, 4.3618276932319826E+08, 3.1512411458738232E+08, 3.0258214643190313E+07, 2.2340399734184548E+05,}, + {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130465E+08, 2.4434902657508330E+09, 2.0073077861288922E+09, -2.0073077861288943E+09, -2.4434902657508330E+09, -5.9387966085130453E+08, -3.0651490267742816E+07, -1.0729981697645638E+05,}, + {2.2594586605749264E+04, 1.3595989066786593E+07, 4.4723032442444897E+08, 3.3781755837397518E+09, 8.6836783895849819E+09, 8.6836783895849762E+09, 3.3781755837397494E+09, 4.4723032442444897E+08, 1.3595989066786474E+07, 2.2594586605749344E+04,} + }}; + } else if constexpr(w==11) { + return std::array, nc> {{ + {-1.6306382886201207E+00, 7.3325946591320434E+00, -2.3241017682854558E+01, 5.1715494398901185E+01, -8.2673000279130790E+01, 9.6489719151212370E+01, -8.2673010381149226E+01, 5.1715494328769353E+01, -2.3241018024860580E+01, 7.3325946448852415E+00, -1.6306382886460551E+00,}, + {-2.2640047135517630E+01, 9.0840898563949466E+01, -2.1597187544386938E+02, 3.1511229111443720E+02, -2.4856617998395282E+02, 6.1683918215190516E-06, 2.4856618439352349E+02, -3.1511228757800421E+02, 2.1597187557069353E+02, -9.0840898570046704E+01, 2.2640047135565219E+01,}, + {-8.3747489794189363E+01, 1.1948077479405792E+02, 4.8528498015072080E+02, -2.5024391114755094E+03, 5.3511195318669425E+03, -6.7655484107390166E+03, 5.3511195362291774E+03, -2.5024391131167667E+03, 4.8528498019392708E+02, 1.1948077480620087E+02, -8.3747489794426258E+01,}, + {3.0888018539740131E+02, -3.7949446187471626E+03, 1.4313303204988082E+04, -2.6681600235594462E+04, 2.3856005166166615E+04, 8.6424601730164351E-06, -2.3856005155895236E+04, 2.6681600234453199E+04, -1.4313303205083188E+04, 3.7949446187583080E+03, -3.0888018539728523E+02,}, + {5.1670143574922731E+03, -2.8613147115372190E+04, 4.3560195427081359E+04, 4.8438679582765450E+04, -2.5856630639231802E+05, 3.7994883866738499E+05, -2.5856630640319458E+05, 4.8438679579510936E+04, 4.3560195426766244E+04, -2.8613147115376054E+04, 5.1670143574922913E+03,}, + {3.0203516161820498E+04, -3.6879059542768438E+04, -4.1141031216788280E+05, 1.4111389975267777E+06, -1.5914376635331670E+06, 9.4095582602103753E-06, 1.5914376635379130E+06, -1.4111389975247320E+06, 4.1141031216776522E+05, 3.6879059542750314E+04, -3.0203516161820549E+04,}, + {1.0933249308680627E+05, 6.9586821127987828E+05, -3.6860240321937902E+06, 2.7428169457736355E+06, 8.3392008440593518E+06, -1.6402201025046850E+07, 8.3392008440698013E+06, 2.7428169457778852E+06, -3.6860240321937371E+06, 6.9586821127989423E+05, 1.0933249308680571E+05,}, + {2.7021781043532980E+05, 5.6764510325100143E+06, -5.5650761736748898E+06, -3.9907385617900200E+07, 7.2453390663687646E+07, 1.2300109686762266E-05, -7.2453390663684472E+07, 3.9907385617899075E+07, 5.5650761736749066E+06, -5.6764510325099993E+06, -2.7021781043532846E+05,}, + {4.6495183529254980E+05, 2.3067199578027144E+07, 6.9832590192482382E+07, -2.2024799260683522E+08, -1.2820270942588677E+08, 5.1017181199129778E+08, -1.2820270942588474E+08, -2.2024799260683942E+08, 6.9832590192482322E+07, 2.3067199578027155E+07, 4.6495183529254742E+05,}, + {5.4814313598122005E+05, 5.8085130777589552E+07, 4.9484006166551048E+08, 1.6222124676640952E+08, -2.0440440381345339E+09, 9.1416457449079640E-06, 2.0440440381345336E+09, -1.6222124676640788E+08, -4.9484006166551071E+08, -5.8085130777589560E+07, -5.4814313598121714E+05,}, + {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266613E+09, 4.7070559561237173E+09, -1.2448027572952359E+09, -1.0161446790279301E+10, -1.2448027572952316E+09, 4.7070559561237268E+09, 1.5259983101266615E+09, 9.2050522922791913E+07, 4.2138380313901149E+05,}, + {1.8969206922085886E+05, 8.4769319065313652E+07, 2.4230555767723408E+09, 1.5439732722639101E+10, 2.7112836839612309E+10, 2.5609833368650835E-06, -2.7112836839612328E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05,}, + {3.7794653219809625E+04, 3.4782300224660739E+07, 1.6188020733727551E+09, 1.7196758809615005E+10, 6.3754384857724617E+10, 9.7196447559193497E+10, 6.3754384857724617E+10, 1.7196758809614998E+10, 1.6188020733727560E+09, 3.4782300224660769E+07, 3.7794653219808984E+04,} + }}; + } else if constexpr(w==12) { + return std::array, nc> {{ + {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}, + {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}, + {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}, + {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}, + {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}, + {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}, + {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}, + {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}, + {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}, + {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}, + {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}, + {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}, + {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}, + {6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04} + }}; + } else if constexpr(w==13) { + return std::array, nc> {{ + {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, -1.0957333740315604E-01}, + {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, 4.3050286009485790E+00}, + {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, -3.0093984465884773E+01}, + {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, 1.9043622268312891E+01}, + {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, 1.2904654687545376E+03}, + {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, -1.2421368748961073E+04}, + {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, 6.7530100970876316E+04}, + {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, -2.5248269397037517E+05}, + {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, 6.8417206432039209E+05}, + {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, -1.3593773865640305E+06}, + {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, 1.9607419630386417E+06}, + {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, -1.9937206140846489E+06}, + {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, 1.3504711883426069E+06}, + {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, -5.4491110456935526E+05}, + {9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, 9.8715725867496090E+04} + }}; + } else if constexpr(w==14) { + return std::array, nc> {{ + {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, -1.2876100566420701E-01, -1.5984859433053292E-02}, + {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, 8.5113924808491728E+00, -1.5925952194145006E+00}, + {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, -5.7000699100179844E+01, 1.4394533639240331E+01}, + {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, -1.8970588700495102E+02, -3.3076333168231550E+01}, + {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, 5.5416668533342381E+03, -4.9913632906195977E+02}, + {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, -3.5474227033406372E+04, 6.4806786523010323E+03}, + {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, 2.2565910606306688E+03, -4.2715272622820135E+04}, + {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, 1.7156606891560503E+06, 1.9151404903936724E+05}, + {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, -1.5741723594963137E+07, -6.2759409419590747E+05}, + {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, 8.3513615594416171E+07, 1.5361613559533576E+06}, + {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, -3.0340753492383808E+08, -2.8079157920112377E+06}, + {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, 7.8376718099107552E+08, 3.7733555140852560E+06}, + {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, -1.4278058213962219E+09, -3.6089249230396664E+06}, + {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, 1.7532505043698275E+09, 2.3170473769380399E+06}, + {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, -1.3065352538728662E+09, -8.9188339002979454E+05}, + {1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, 4.4723032442444855E+08, 1.5499533202970232E+05} + }}; + } else if constexpr(w==15) { + return std::array, nc> {{ + {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03}, + {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01}, + {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00}, + {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01}, + {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02}, + {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03}, + {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04}, + {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05}, + {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05}, + {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06}, + {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06}, + {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06}, + {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06}, + {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06}, + {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06}, + {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06}, + {2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05} + }}; + } else if constexpr(w==16) { + return std::array, nc> {{ + {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, -1.1808834826225629E-02}, + {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, -1.5401723756214594E-01}, + {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, 2.3155118723150525E+00}, + {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, -1.2975319073977776E+01}, + {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, -3.1189837631106176E+01}, + {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, 1.2019749667911419E+03}, + {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, -1.1778892113376129E+04}, + {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, 7.3893334077307401E+04}, + {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, -3.3854610744279937E+05}, + {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, 1.1821527096621762E+06}, + {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, -3.1906872142825015E+06}, + {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, 6.6544809363384554E+06}, + {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, -1.0609303958036322E+07}, + {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, 1.2655725616100591E+07}, + {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, -1.0896915393078227E+07}, + {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, 6.3730995546265030E+06}, + {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, -2.2576246485480373E+06}, + {3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, 3.6434551345571183E+05} + }}; + } else { + static_assert(w >= 2, "w must be >= 2"); + static_assert(w <= 16, "w must be <= 16"); + return {}; + } }; + + diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b07128d02..653f0327a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -720,25 +720,17 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ static constexpr auto padded_ns = (w + avx_size - 1) & ~(avx_size - 1); static constexpr auto nc = nc200(); static constexpr auto horner_coeffs = get_horner_coeffs_200(); + alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - const std::array pow_z = [](const FLT z) constexpr noexcept { - std::array zs{}; - std::array zs_v{}; - zs[0] = z; - for (uint8_t i = 1; i < nc - 1; ++i) { - zs[i] = zs[i - 1] * z; - } - for (uint8_t i = 0; i < nc - 1; ++i) { - zs_v[i] = batch_t::broadcast(zs[i]); - } - return zs_v; - }(z); + + const auto zv = batch_t(z); + for (uint8_t i = 0; i < w; i += avx_size) { auto k = batch_t::load_aligned(padded_coeffs[0].data() + i); for (uint8_t j = 1; j < nc; ++j) { const auto cji = batch_t::load_aligned(padded_coeffs[j].data() + i); - k = xsimd::fma(cji, pow_z[j - 1], k); + k = xsimd::fma(k, zv, cji); } k.store_aligned(ker + i); } @@ -1048,7 +1040,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // This can only happen if the overall error would be O(1) anyway. Clip x1?? if (x1 < -ns2) x1 = -ns2; if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** - alignas(alignment) const auto ker = ker_eval(opts, x1); + //alignas(alignment) const auto ker = ker_eval(opts, x1); + const auto &ker = ker_eval(opts, x1); const auto j = i1 - off1; // offset rel to subgrid, starts the output indices auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize // du is padded, so we can use SIMD even if we write more than ns values in du @@ -1213,7 +1206,8 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; - alignas(alignment) const auto kernel_values = + //alignas(alignment) const auto kernel_values = + const auto &kernel_values = ker_eval(opts, x1, x2); alignas(alignment) auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); alignas(alignment) auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; @@ -1320,7 +1314,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; - alignas(alignment) const auto kernel_values = + //alignas(alignment) const auto kernel_values = + const auto &kernel_values = ker_eval(opts, x1, x2, x3); auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; From 9e7efd2d6dda3243043438fbe84e3e944c072ce4 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Tue, 18 Jun 2024 16:43:56 -0400 Subject: [PATCH 19/35] revert const auto & --- src/spreadinterp.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 653f0327a..ec289caa9 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1040,8 +1040,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // This can only happen if the overall error would be O(1) anyway. Clip x1?? if (x1 < -ns2) x1 = -ns2; if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** - //alignas(alignment) const auto ker = ker_eval(opts, x1); - const auto &ker = ker_eval(opts, x1); + alignas(alignment) const auto ker = ker_eval(opts, x1); + //const auto &ker = ker_eval(opts, x1); const auto j = i1 - off1; // offset rel to subgrid, starts the output indices auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize // du is padded, so we can use SIMD even if we write more than ns values in du @@ -1206,8 +1206,8 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; - //alignas(alignment) const auto kernel_values = - const auto &kernel_values = + alignas(alignment) const auto kernel_values = + //const auto &kernel_values = ker_eval(opts, x1, x2); alignas(alignment) auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); alignas(alignment) auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; @@ -1314,8 +1314,8 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; - //alignas(alignment) const auto kernel_values = - const auto &kernel_values = + alignas(alignment) const auto kernel_values = + //const auto &kernel_values = ker_eval(opts, x1, x2, x3); auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; From e5aeb5b8867a0359d812eed24c00719fd14c2c43 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Thu, 20 Jun 2024 14:01:06 -0400 Subject: [PATCH 20/35] add triqs user --- docs/users.rst | 4 +++- perftest/compare_spreads.jl | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/users.rst b/docs/users.rst index d0a53f8e6..3cfe8c3b4 100644 --- a/docs/users.rst +++ b/docs/users.rst @@ -44,7 +44,9 @@ and also add them to GitHub's Used By feature): #. `nifty-ls `_: Fast evaluation of the Lomb-Scargle periodogram for time series analysis, backed by finufft or cufinufft - +#. `TRIQS CTINT `_: continous time interaction-expansion solver, by N. Wentzell and O. Parcollet (Flatiron Institute, part of platform for interacting quantum systems). + + Other wrappers to (cu)FINUFFT ------------------------------ diff --git a/perftest/compare_spreads.jl b/perftest/compare_spreads.jl index 46a1d8ed4..f7dec6db1 100644 --- a/perftest/compare_spreads.jl +++ b/perftest/compare_spreads.jl @@ -4,7 +4,7 @@ using CairoMakie using JLD2 # for load/save arrays to file using UnPack -fnam = "results/master-vs-svec2_gcc114_5700U_nthr8" # outfile head +fnam = "results/master-vs-svec2l_gcc114_5700U_nthr8" # outfile head # locations of pair of FINUFFT repos to compare... repo1 = "/home/alex/numerics/finufft" repo2 = "/home/alex/numerics/nufft/finufft-svec2" From ac78f49404b978fcc0e0571f7644f1641ae4b050 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 12:42:21 -0400 Subject: [PATCH 21/35] vectorized horner in interp --- src/spreadinterp.cpp | 201 +++++++++++++++++++++++-------------------- 1 file changed, 110 insertions(+), 91 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index ec289caa9..e568c920e 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -55,24 +55,27 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset template()>, typename... V> -static FINUFFT_ALWAYS_INLINE auto ker_eval(const finufft_spread_opts &opts, - const V... elems) noexcept; +static auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, + const V... elems) noexcept; static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N) noexcept; static FINUFFT_ALWAYS_INLINE void set_kernel_args( FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( FLT *ker, FLT *args, const finufft_spread_opts &opts, int N) noexcept; -static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( - FLT *ker, FLT x, int w, const finufft_spread_opts &opts) noexcept; template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; -static void interp_line(FLT *out, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns); -static void interp_square(FLT *out, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, - BIGINT N1, BIGINT N2, int ns); -static void interp_cube(FLT *out, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i1, - BIGINT i2, BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3, int ns); +template +static void interp_line(FLT *FINUFFT_RESTRICT out, const FLT *du, const FLT *ker, + BIGINT i1, BIGINT N1); +template +static void interp_square(FLT *FINUFFT_RESTRICT out, const FLT *du, const FLT *ker1, + const FLT *ker2, BIGINT i1, BIGINT i2, BIGINT N1, BIGINT N2); +template +static void interp_cube(FLT *FINUFFT_RESTRICT out, const FLT *du, const FLT *ker1, + const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, + BIGINT N1, BIGINT N2, BIGINT N3); static void spread_subproblem_1d(BIGINT off1, BIGINT size1, FLT *du0, BIGINT M0, FLT *kx0, FLT *dd0, const finufft_spread_opts &opts) noexcept; static void spread_subproblem_2d(BIGINT off1, BIGINT off2, BIGINT size1, BIGINT size2, @@ -373,7 +376,7 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat #pragma omp parallel num_threads(nthr) { // local copies of NU pts and data for each subproblem - std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; + std::vector kx0{0}, ky0{0}, kz0{0}, dd0{0}, du0{0}; #pragma omp for schedule(dynamic, 1) // each is big for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem @@ -437,16 +440,22 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat }; // -------------------------------------------------------------------------- -int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - finufft_spread_opts opts, int did_sort) +template +int interpSorted_kernel(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { + using batch_t = xsimd::batch; + using arch_t = typename batch_t::arch_type; + static constexpr auto padding = get_padding(); + static constexpr auto alignment = batch_t::arch_type::alignment(); + static constexpr auto avx_size = batch_t::size; + static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift + CNTime timer; int ndims = ndims_from_Ns(N1, N2, N3); - int ns = opts.nspread; // abbrev. for w, kernel width - FLT ns2 = (FLT)ns / 2; // half spread width, used as stencil shift int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP @@ -455,7 +464,6 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat if (opts.debug) printf("\tinterp %dD (M=%lld; N1=%lld,N2=%lld,N3=%lld), nthr=%d\n", ndims, (long long)M, (long long)N1, (long long)N2, (long long)N3, nthr); - timer.start(); #pragma omp parallel num_threads(nthr) { @@ -464,16 +472,16 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat FLT xjlist[CHUNKSIZE], yjlist[CHUNKSIZE], zjlist[CHUNKSIZE]; FLT outbuf[2 * CHUNKSIZE]; // Kernels: static alloc is faster, so we do it for up to 3D... - FLT kernel_args[3 * MAX_NSPREAD]; - FLT kernel_values[3 * MAX_NSPREAD]; - FLT *ker1 = kernel_values; - FLT *ker2 = kernel_values + ns; - FLT *ker3 = kernel_values + 2 * ns; + alignas(alignment) std::array kernel_values{0}; + FLT *FINUFFT_RESTRICT ker1 = kernel_values.data(); + FLT *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + FLT *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; // Loop over interpolation chunks #pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) // main loop over NU targs, interp each from + for (BIGINT i = 0; i < M; i += CHUNKSIZE) // main loop over NU trgs, interp each from // U + { // Setup buffers for this chunk int bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; @@ -504,31 +512,19 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // eval kernel values patch and use to interpolate from uniform data... if (!(opts.flags & TF_OMIT_SPREADING)) { - - if (opts.kerevalmeth == 0) { // choose eval method - set_kernel_args(kernel_args, x1, opts); - if (ndims > 1) set_kernel_args(kernel_args + ns, x2, opts); - if (ndims > 2) set_kernel_args(kernel_args + 2 * ns, x3, opts); - - evaluate_kernel_vector(kernel_values, kernel_args, opts, ndims * ns); - } - - else { - eval_kernel_vec_Horner(ker1, x1, ns, opts); - if (ndims > 1) eval_kernel_vec_Horner(ker2, x2, ns, opts); - if (ndims > 2) eval_kernel_vec_Horner(ker3, x3, ns, opts); - } - switch (ndims) { case 1: - interp_line(target, data_uniform, ker1, i1, N1, ns); + ker_eval(kernel_values.data(), opts, x1); + interp_line(target, data_uniform, ker1, i1, N1); break; case 2: - interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, N2, ns); + ker_eval(kernel_values.data(), opts, x1, x2); + interp_square(target, data_uniform, ker1, ker2, i1, i2, N1, N2); break; case 3: - interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2, N3, - ns); + ker_eval(kernel_values.data(), opts, x1, x2, x3); + interp_cube(target, data_uniform, ker1, ker2, ker3, i1, i2, i3, N1, N2, + N3); break; default: // can't get here break; @@ -547,7 +543,47 @@ int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat } // end parallel section if (opts.debug) printf("\tt2 spreading loop: \t%.3g s\n", timer.elapsedsec()); return 0; -}; +} + +template +int interpSorted_dispatch(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) { + static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, + "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); + if constexpr (NS == MIN_NSPREAD) { // Base case + if (opts.kerevalmeth) + return interpSorted_kernel(sort_indices, N1, N2, N3, + data_uniform, M, kx, ky, kz, + data_nonuniform, opts, did_sort); + else { + return interpSorted_kernel(sort_indices, N1, N2, N3, + data_uniform, M, kx, ky, kz, + data_nonuniform, opts, did_sort); + } + } else { + if (opts.nspread == NS) { + if (opts.kerevalmeth) { + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts, did_sort); + } else { + return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, + kx, ky, kz, data_nonuniform, opts, + did_sort); + } + } else { + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, + ky, kz, data_nonuniform, opts, did_sort); + } + } +} + +int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, + BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, + finufft_spread_opts opts, int did_sort) { + return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, + ky, kz, data_nonuniform, opts, did_sort); +} /////////////////////////////////////////////////////////////////////////// @@ -742,27 +778,10 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ return; } } -void eval_kernel_vec_Horner(FLT *ker, const FLT x, const int w, - const finufft_spread_opts &opts) noexcept -/* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at - x_j = x + j, for j=0,..,w-1. Thus x in [-w/2,-w/2+1]. w is aka ns. - This is the current evaluation method, since it's faster (except i7 w=16). - Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ -{ - if (!(opts.flags & TF_OMIT_EVALUATE_KERNEL)) { - const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in - // [-1,1] - // insert the auto-generated code which expects z, w args, writes to ker... - if (opts.upsampfac == 2.0) { // floating point equality is fine here -#include "ker_horner_allw_loop.c" - } else if (opts.upsampfac == 1.25) { -#include "ker_lowupsampfac_horner_allw_loop.c" - } else - fprintf(stderr, "%s: unknown upsampfac, failed!\n", __func__); - } -} -void interp_line(FLT *target, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns) +template +void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, BIGINT i1, + BIGINT N1) /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -818,8 +837,9 @@ void interp_line(FLT *target, FLT *du, FLT *ker, BIGINT i1, BIGINT N1, int ns) target[1] = out[1]; } -void interp_square(FLT *target, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT i2, - BIGINT N1, BIGINT N2, int ns) +template +void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, + const FLT *ker2, BIGINT i1, BIGINT i2, BIGINT N1, BIGINT N2) /* 2D interpolate complex values from a ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns outer product of the 1d kernel lists ker1 and ker2. @@ -897,8 +917,10 @@ void interp_square(FLT *target, FLT *du, FLT *ker1, FLT *ker2, BIGINT i1, BIGINT target[1] = out[1]; } -void interp_cube(FLT *target, FLT *du, FLT *ker1, FLT *ker2, FLT *ker3, BIGINT i1, - BIGINT i2, BIGINT i3, BIGINT N1, BIGINT N2, BIGINT N3, int ns) +template +void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, + const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, + BIGINT N1, BIGINT N2, BIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns*ns outer product of the 1d kernel lists ker1, ker2, and ker3. @@ -1014,13 +1036,14 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width // something weird here. Reversing ker{0} and std fill causes ker // to be zeroed inside the loop GCC uses AVX, clang AVX2 + alignas(alignment) std::array ker{0}; std::fill(du, du + 2 * size1, 0); // zero output // no padding needed if MAX_NSPREAD is 16 // the largest read is 16 floats with avx512 // if larger instructions will be available or half precision is used, this should be // padded for (uint64_t i{0}; i < M; i++) { // loop over NU pts - // lamda here to return a dd_pt that is const + // initializes a dd_pt that is const // should not make a difference in performance // but is a hint to the compiler that after the lambda // dd_pt is not modified and can be kept as is in a register @@ -1040,16 +1063,16 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // This can only happen if the overall error would be O(1) anyway. Clip x1?? if (x1 < -ns2) x1 = -ns2; if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** - alignas(alignment) const auto ker = ker_eval(opts, x1); - //const auto &ker = ker_eval(opts, x1); - const auto j = i1 - off1; // offset rel to subgrid, starts the output indices + ker_eval(ker.data(), opts, x1); + // const auto ker = ker_eval(opts, x1); + const auto j = i1 - off1; // offset rel to subgrid, starts the output indices auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize // du is padded, so we can use SIMD even if we write more than ns values in du - // ker0 is also padded. + // ker is also padded. // regular_part is the largest multiple of 2*ns minus the remainder modulo // (2*avx_size). This allows to save one load. // see below for the details. - // adding padding to guarantee that all the elments are computed + // adding padding to guarantee that all the elements are computed // this trick only works when avx_size is a power of 2 // avx_size*2 is guaranteed to be a power of 2, trivially static constexpr auto regular_part = (2 * ns + padding) & (-(2 * avx_size)); @@ -1198,6 +1221,7 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( // Kernel values stored in consecutive memory. This allows us to compute // values in all three directions in a single kernel evaluation call. static constexpr auto ns2 = ns * FLT(0.5); // half spread width + alignas(alignment) std::array kernel_values{0}; std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); @@ -1206,11 +1230,10 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; - alignas(alignment) const auto kernel_values = - //const auto &kernel_values = - ker_eval(opts, x1, x2); - alignas(alignment) auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); - alignas(alignment) auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + // alignas(alignment) const auto kernel_values = + ker_eval(kernel_values.data(), opts, x1, x2); + const auto *ker1 = kernel_values.data(); + const auto *ker2 = kernel_values.data() + MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop // here 2* is because of complex static constexpr uint8_t batches = (2 * ns + padding) / avx_size; @@ -1303,8 +1326,9 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( static constexpr auto avx_size = batch_t::size; static constexpr auto alignment = batch_t::arch_type::alignment(); static constexpr auto ns2 = ns * FLT(0.5); // half spread width + alignas(alignment) std::array kernel_values{0}; std::fill(du, du + 2 * size1 * size2 * size3, 0); - for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts + for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices @@ -1314,12 +1338,10 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; - alignas(alignment) const auto kernel_values = - //const auto &kernel_values = - ker_eval(opts, x1, x2, x3); - auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); - auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; - auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; + ker_eval(kernel_values.data(), opts, x1, x2, x3); + const auto *ker1 = kernel_values.data(); + const auto *ker2 = kernel_values.data() + MAX_NSPREAD; + const auto *ker3 = kernel_values.data() + 2 * MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop // here 2* is because of complex // Batches is the number of SIMD iterations needed to compute all the elements @@ -1699,7 +1721,8 @@ FLT fold_rescale(const FLT x, const BIGINT N) noexcept { } template -auto ker_eval(const finufft_spread_opts &opts, const V... elems) noexcept { +auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, + const V... elems) noexcept { /* Utility function that allows to move the kernel evaluation outside the spreader for clarity Inputs are: ns = kernel width kerevalmeth = kernel evaluation method T = (single or double precision) type of the kernel batch_t = batch type for Horner @@ -1708,21 +1731,17 @@ auto ker_eval(const finufft_spread_opts &opts, const V... elems) noexcept { kerevalmeth>(opts, x, y, z) // for 3D or ker_eval(opts, x, y) // for 2D or ker_eval(opts, x) // for 1D */ - alignas(batch_t::arch_type::alignment()) std::array - ker{0}; const std::array inputs{elems...}; // compile time loop, no performance overhead for (auto i = 0; i < sizeof...(elems); ++i) { // compile time branch no performance overhead if constexpr (kerevalmeth == 1) { - eval_kernel_vec_Horner(ker.data() + (i * MAX_NSPREAD), inputs[i], - opts); + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], opts); } if constexpr (kerevalmeth == 0) { alignas(batch_t::arch_type::alignment()) std::array kernel_args{}; set_kernel_args(kernel_args.data(), inputs[i], opts); - evaluate_kernel_vector(ker.data() + (i * MAX_NSPREAD), kernel_args.data(), opts, - ns); + evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts, ns); } } return ker; From 07b250188dad75e666ed55eefef6d31d9ef12a3f Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 19:19:05 -0400 Subject: [PATCH 22/35] Fixed some comments --- devel/padding.cpp | 78 ++--- makefile | 2 +- src/ker_horner_allw_loop.c | 207 -------------- src/ker_lowupsampfac_horner_allw_loop.c | 192 ------------- src/spreadinterp.cpp | 363 +++++++++++++----------- 5 files changed, 243 insertions(+), 599 deletions(-) delete mode 100644 src/ker_horner_allw_loop.c delete mode 100644 src/ker_lowupsampfac_horner_allw_loop.c diff --git a/devel/padding.cpp b/devel/padding.cpp index e8b337bfd..95ace9e12 100644 --- a/devel/padding.cpp +++ b/devel/padding.cpp @@ -8,7 +8,7 @@ template static constexpr auto BestSIMDHelper(); -template static constexpr auto GetPaddedSIMDSize(); +template static constexpr auto GetPaddedSIMDWidth(); template static uint16_t get_padding(uint16_t ns); @@ -17,11 +17,11 @@ template static constexpr auto get_padding(); template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; -template static constexpr uint16_t min_batch_size(); +template static constexpr uint16_t min_simd_width(); -template()> constexpr uint16_t max_batch_size(); +template()> constexpr uint16_t max_simd_width(); -template static constexpr auto find_optimal_batch_size(); +template static constexpr auto find_optimal_simd_width(); // below there is some trickery to obtain the padded SIMD type to vectorize // the given number of elements. @@ -37,26 +37,26 @@ template static constexpr auto BestSIMDHelper() } } -template constexpr uint16_t min_batch_size() { +template constexpr uint16_t min_simd_width() { if constexpr (std::is_void_v>) { - return min_batch_size(); + return min_simd_width(); } else { return N; } }; -template constexpr uint16_t max_batch_size() { +template constexpr uint16_t max_simd_width() { if constexpr (!std::is_void_v>) { - return max_batch_size(); + return max_simd_width(); } else { return N; } }; -template static constexpr auto find_optimal_batch_size() { +template static constexpr auto find_optimal_simd_width() { uint16_t min_iterations = N; uint16_t optimal_batch_size = 1; - for (uint16_t batch_size = min_batch_size(); batch_size <= xsimd::batch::size; + for (uint16_t batch_size = min_simd_width(); batch_size <= xsimd::batch::size; batch_size *= 2) { uint16_t iterations = (N + batch_size - 1) / batch_size; if (iterations < min_iterations) { @@ -67,13 +67,13 @@ template static constexpr auto find_optimal_batch_size() { return optimal_batch_size; } -template static constexpr auto GetPaddedSIMDSize() { +template static constexpr auto GetPaddedSIMDWidth() { static_assert(N < 128); - return xsimd::make_sized_batch()>::type::size; + return xsimd::make_sized_batch()>::type::size; } template static constexpr auto get_padding() { - constexpr uint16_t width = GetPaddedSIMDSize(); + constexpr uint16_t width = GetPaddedSIMDWidth(); return ((ns + width - 1) & (-width)) - ns; } @@ -113,8 +113,8 @@ template constexpr uint16_t po2_in_between() { } template constexpr auto mixed_vectors() { - constexpr auto min_batch = min_batch_size(); - constexpr auto max_batch = max_batch_size(); + constexpr auto min_batch = min_simd_width(); + constexpr auto max_batch = max_simd_width(); // compute all the power of 2 between min_batch and max_batch std::array() + 1> batch_sizes{1}; @@ -145,13 +145,13 @@ template constexpr auto mixed_vectors() { int main(int argc, char *argv[]) { std::cout << "Min batch size for single precision is " - << uint64_t(min_batch_size()) << std::endl; + << uint64_t(min_simd_width()) << std::endl; std::cout << "Max batch size for single precision is " - << uint64_t(max_batch_size()) << std::endl; + << uint64_t(max_simd_width()) << std::endl; std::cout << "Min batch size for double precision is " - << uint64_t(min_batch_size()) << std::endl; + << uint64_t(min_simd_width()) << std::endl; std::cout << "Max batch size for double precision is " - << uint64_t(max_batch_size()) << std::endl; + << uint64_t(max_simd_width()) << std::endl; std::cout << "Best SIMD single precision" << std::endl; std::cout << "SIMD for " << 4 << " is " << uint64_t(BestSIMD::size) @@ -191,47 +191,47 @@ int main(int argc, char *argv[]) { std::cout << "Padded SIMD single precision" << std::endl; std::cout << "Padded SIMD for " << 4 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 6 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 10 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 12 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 15 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 18 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 22 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 26 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 30 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 32 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD double precision" << std::endl; std::cout << "Padded SIMD for " << 4 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 6 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 10 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 12 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 15 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 18 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 22 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 26 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 30 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "Padded SIMD for " << 32 << " is " - << uint64_t(GetPaddedSIMDSize()) << std::endl; + << uint64_t(GetPaddedSIMDWidth()) << std::endl; std::cout << "single precision" << std::endl; for (auto i = 2; i < 16; i++) { diff --git a/makefile b/makefile index 709b5cf3c..a3020ce0b 100644 --- a/makefile +++ b/makefile @@ -421,7 +421,7 @@ docker-wheel: define clone_repo @echo "Cloning repository $(1) at tag $(2) into directory $(3)" @if [ ! -d "$(3)" ]; then \ - git clone --branch $(2) $(1) $(3); \ + git clone --depth=1 --branch $(2) $(1) $(3); \ else \ cd $(3) && \ CURRENT_VERSION=$$(git describe --tags --abbrev=0) && \ diff --git a/src/ker_horner_allw_loop.c b/src/ker_horner_allw_loop.c deleted file mode 100644 index 2f93f3e78..000000000 --- a/src/ker_horner_allw_loop.c +++ /dev/null @@ -1,207 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - FLT c0[] = {4.5147043243215315E+01, 4.5147043243215300E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {5.7408070938221300E+01, -5.7408070938221293E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {-1.8395117920046484E+00, -1.8395117920046560E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {-2.0382426253182082E+01, 2.0382426253182086E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {-2.0940804433577420E+00, -2.0940804433577389E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==3) { - FLT c0[] = {1.5653991189315119E+02, 8.8006872410780295E+02, 1.5653991189967152E+02, 0.0000000000000000E+00}; - FLT c1[] = {3.1653018869611077E+02, 7.4325702843759617E-14, -3.1653018868907071E+02, 0.0000000000000000E+00}; - FLT c2[] = {1.7742692790454484E+02, -3.3149255274727801E+02, 1.7742692791117119E+02, 0.0000000000000000E+00}; - FLT c3[] = {-1.5357716116473156E+01, 9.5071486252033243E-15, 1.5357716122720193E+01, 0.0000000000000000E+00}; - FLT c4[] = {-3.7757583061523668E+01, 5.3222970968867315E+01, -3.7757583054647384E+01, 0.0000000000000000E+00}; - FLT c5[] = {-3.9654011076088804E+00, 1.8062124448285358E-13, 3.9654011139270540E+00, 0.0000000000000000E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==4) { - FLT c0[] = {5.4284366850213200E+02, 1.0073871433088398E+04, 1.0073871433088396E+04, 5.4284366850213223E+02}; - FLT c1[] = {1.4650917259256939E+03, 6.1905285583602863E+03, -6.1905285583602881E+03, -1.4650917259256937E+03}; - FLT c2[] = {1.4186910680718345E+03, -1.3995339862725591E+03, -1.3995339862725598E+03, 1.4186910680718347E+03}; - FLT c3[] = {5.1133995502497419E+02, -1.4191608683682996E+03, 1.4191608683682998E+03, -5.1133995502497424E+02}; - FLT c4[] = {-4.8293622641174039E+01, 3.9393732546135226E+01, 3.9393732546135816E+01, -4.8293622641174061E+01}; - FLT c5[] = {-7.8386867802392288E+01, 1.4918904800408930E+02, -1.4918904800408751E+02, 7.8386867802392359E+01}; - FLT c6[] = {-1.0039212571700894E+01, 5.0626747735616746E+00, 5.0626747735625512E+00, -1.0039212571700640E+01}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==5) { - FLT c0[] = {9.9223677575398392E+02, 3.7794697666613320E+04, 9.8715771010760494E+04, 3.7794697666613283E+04, 9.9223677575398403E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {3.0430174925083825E+03, 3.7938404259811403E+04, -1.1842989705877139E-11, -3.7938404259811381E+04, -3.0430174925083829E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {3.6092689177271222E+03, 7.7501368899498666E+03, -2.2704627332475000E+04, 7.7501368899498730E+03, 3.6092689177271218E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.9990077310495396E+03, -3.8875294641277296E+03, 9.7116927320010791E-12, 3.8875294641277369E+03, -1.9990077310495412E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {4.0071733590403869E+02, -1.5861137916762602E+03, 2.3839858699098645E+03, -1.5861137916762643E+03, 4.0071733590403909E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {-9.1301168206167262E+01, 1.2316471075214675E+02, 2.0698495299948402E-11, -1.2316471075214508E+02, 9.1301168206167233E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {-5.5339722671223846E+01, 1.1960590540261879E+02, -1.5249941358311668E+02, 1.1960590540262307E+02, -5.5339722671223605E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {-3.3762488150353924E+00, 2.2839981872948751E+00, 7.1884725699454154E-12, -2.2839981872943818E+00, 3.3762488150341459E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==6) { - FLT c0[] = {2.0553833234911876E+03, 1.5499537739913128E+05, 8.1177907023291115E+05, 8.1177907023291173E+05, 1.5499537739913136E+05, 2.0553833235005691E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {7.1269776034442639E+03, 2.0581923258843314E+05, 3.1559612614917674E+05, -3.1559612614917627E+05, -2.0581923258843317E+05, -7.1269776034341394E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.0023404568475091E+04, 9.0916650498360192E+04, -1.0095927514054619E+05, -1.0095927514054628E+05, 9.0916650498360177E+04, 1.0023404568484635E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {7.2536109410387417E+03, 4.8347162752602981E+03, -5.0512736602018522E+04, 5.0512736602018478E+04, -4.8347162752603008E+03, -7.2536109410297540E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {2.7021878300949752E+03, -7.8773465553972646E+03, 5.2105876478342780E+03, 5.2105876478343343E+03, -7.8773465553972710E+03, 2.7021878301048723E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {3.2120291706547636E+02, -1.8229189469936762E+03, 3.7928113414429808E+03, -3.7928113414427025E+03, 1.8229189469937312E+03, -3.2120291705638243E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {-1.2051267090537374E+02, 2.2400507411399673E+02, -1.2506575852541796E+02, -1.2506575852521925E+02, 2.2400507411398695E+02, -1.2051267089640181E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {-4.5977202613350237E+01, 1.1536880606853076E+02, -1.7819720186493959E+02, 1.7819720186497622E+02, -1.1536880606854736E+02, 4.5977202622148909E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {-1.5631081288842275E+00, 7.1037430591266115E-01, -6.9838401121429056E-02, -6.9838401186476856E-02, 7.1037430589285400E-01, -1.5631081203754575E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==7) { - FLT c0[] = {3.9948351830487481E+03, 5.4715865608590771E+05, 5.0196413492771760E+06, 9.8206709220713247E+06, 5.0196413492771825E+06, 5.4715865608590783E+05, 3.9948351830642519E+03, 0.0000000000000000E+00}; - FLT c1[] = {1.5290160332974696E+04, 8.7628248584320408E+05, 3.4421061790934438E+06, -2.6908159596373561E-10, -3.4421061790934461E+06, -8.7628248584320408E+05, -1.5290160332958067E+04, 0.0000000000000000E+00}; - FLT c2[] = {2.4458227486779251E+04, 5.3904618484139396E+05, 2.4315566181017534E+05, -1.6133959371974322E+06, 2.4315566181017453E+05, 5.3904618484139396E+05, 2.4458227486795113E+04, 0.0000000000000000E+00}; - FLT c3[] = {2.1166189345881645E+04, 1.3382732160223130E+05, -3.3113450969689694E+05, 6.9013724510092140E-10, 3.3113450969689724E+05, -1.3382732160223136E+05, -2.1166189345866893E+04, 0.0000000000000000E+00}; - FLT c4[] = {1.0542795672344864E+04, -7.0739172265098678E+03, -6.5563293056049893E+04, 1.2429734005960064E+05, -6.5563293056049602E+04, -7.0739172265098332E+03, 1.0542795672361213E+04, 0.0000000000000000E+00}; - FLT c5[] = {2.7903491906228419E+03, -1.0975382873973093E+04, 1.3656979541144799E+04, 7.7346408577822045E-10, -1.3656979541143772E+04, 1.0975382873973256E+04, -2.7903491906078298E+03, 0.0000000000000000E+00}; - FLT c6[] = {1.6069721418053300E+02, -1.5518707872251393E+03, 4.3634273936642621E+03, -5.9891976420595174E+03, 4.3634273936642730E+03, -1.5518707872251064E+03, 1.6069721419533221E+02, 0.0000000000000000E+00}; - FLT c7[] = {-1.2289277373867256E+02, 2.8583630927743314E+02, -2.8318194617327981E+02, 6.9043515551118249E-10, 2.8318194617392436E+02, -2.8583630927760140E+02, 1.2289277375319763E+02, 0.0000000000000000E+00}; - FLT c8[] = {-3.2270164914249058E+01, 9.1892112257581346E+01, -1.6710678096334209E+02, 2.0317049305432383E+02, -1.6710678096383771E+02, 9.1892112257416159E+01, -3.2270164900224913E+01, 0.0000000000000000E+00}; - FLT c9[] = {-1.4761409685186277E-01, -9.1862771280377487E-01, 1.2845147741777752E+00, 5.6547359492808854E-10, -1.2845147728310689E+00, 9.1862771293147971E-01, 1.4761410890866353E-01, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==8) { - FLT c0[] = {7.3898000697447915E+03, 1.7297637497600035E+06, 2.5578341605285794E+07, 8.4789650417103335E+07, 8.4789650417103350E+07, 2.5578341605285816E+07, 1.7297637497600049E+06, 7.3898000697447915E+03}; - FLT c1[] = {3.0719636811267599E+04, 3.1853145713323927E+06, 2.3797981861403696E+07, 2.4569731244678464E+07, -2.4569731244678471E+07, -2.3797981861403704E+07, -3.1853145713323941E+06, -3.0719636811267606E+04}; - FLT c2[] = {5.4488498478251728E+04, 2.4101183255475131E+06, 6.4554051283428287E+06, -8.9200440393090546E+06, -8.9200440393090583E+06, 6.4554051283428324E+06, 2.4101183255475126E+06, 5.4488498478251728E+04}; - FLT c3[] = {5.3926359802542116E+04, 9.0469037926849292E+05, -6.0897036277696118E+05, -3.0743852105799988E+06, 3.0743852105800058E+06, 6.0897036277696711E+05, -9.0469037926849339E+05, -5.3926359802542138E+04}; - FLT c4[] = {3.2444118016247590E+04, 1.3079802224392134E+05, -5.8652889370129269E+05, 4.2333306008151924E+05, 4.2333306008152053E+05, -5.8652889370128722E+05, 1.3079802224392109E+05, 3.2444118016247590E+04}; - FLT c5[] = {1.1864306345505294E+04, -2.2700360645707988E+04, -5.0713607251414309E+04, 1.8308704458211688E+05, -1.8308704458210632E+05, 5.0713607251413123E+04, 2.2700360645707628E+04, -1.1864306345505294E+04}; - FLT c6[] = {2.2812256770903232E+03, -1.1569135767377773E+04, 2.0942387020798891E+04, -1.1661592834945191E+04, -1.1661592834940149E+04, 2.0942387020801420E+04, -1.1569135767377924E+04, 2.2812256770903286E+03}; - FLT c7[] = {8.5503535636821422E+00, -9.7513976461238224E+02, 3.8242995179171526E+03, -6.9201295567267280E+03, 6.9201295567248662E+03, -3.8242995179155446E+03, 9.7513976461209836E+02, -8.5503535637013552E+00}; - FLT c8[] = {-1.0230637348345023E+02, 2.8246898554269114E+02, -3.8638201738139219E+02, 1.9106407993320320E+02, 1.9106407993289886E+02, -3.8638201738492717E+02, 2.8246898554219217E+02, -1.0230637348345138E+02}; - FLT c9[] = {-1.9200143062947848E+01, 6.1692257626706223E+01, -1.2981109187842989E+02, 1.8681284210471688E+02, -1.8681284209654376E+02, 1.2981109187880142E+02, -6.1692257626845532E+01, 1.9200143062947120E+01}; - FLT c10[] = {3.7894993760177598E-01, -1.7334408836731494E+00, 2.5271184057877303E+00, -1.2600963971824484E+00, -1.2600963917834651E+00, 2.5271184069685657E+00, -1.7334408840526812E+00, 3.7894993760636758E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==9) { - FLT c0[] = {1.3136365370186100E+04, 5.0196413492771806E+06, 1.1303327711722563E+08, 5.8225443924996686E+08, 9.7700272582690656E+08, 5.8225443924996758E+08, 1.1303327711722568E+08, 5.0196413492772207E+06, 1.3136365370186135E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {5.8623313038274340E+04, 1.0326318537280345E+07, 1.2898448324824864E+08, 3.0522863709830385E+08, -3.9398045056223735E-08, -3.0522863709830391E+08, -1.2898448324824864E+08, -1.0326318537280388E+07, -5.8623313038274347E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.1335001341875963E+05, 9.0726133144784812E+06, 5.3501544534038112E+07, -2.6789524644146336E+05, -1.2483923718899371E+08, -2.6789524644172983E+05, 5.3501544534038112E+07, 9.0726133144785129E+06, 1.1335001341875960E+05, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.2489113703229747E+05, 4.3035547171861930E+06, 6.3021978510598792E+06, -2.6014941986659057E+07, 6.0417403157325170E-08, 2.6014941986659389E+07, -6.3021978510598652E+06, -4.3035547171862079E+06, -1.2489113703229751E+05, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {8.6425493435991244E+04, 1.0891182836653308E+06, -2.0713033564200639E+06, -2.8994941183506218E+06, 7.5905338661205899E+06, -2.8994941183505375E+06, -2.0713033564200667E+06, 1.0891182836653353E+06, 8.6425493435991288E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {3.8657354724013814E+04, 7.9936390113331305E+04, -7.0458265546791907E+05, 1.0151095605715880E+06, 1.2138090419648379E-07, -1.0151095605717725E+06, 7.0458265546794771E+05, -7.9936390113331567E+04, -3.8657354724013821E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {1.0779131453134638E+04, -3.3466718311300596E+04, -1.3245366619006139E+04, 1.8238470515353698E+05, -2.9285656292977190E+05, 1.8238470515350526E+05, -1.3245366619000662E+04, -3.3466718311299621E+04, 1.0779131453134616E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {1.4992527030548456E+03, -9.7024371533891372E+03, 2.3216330734057381E+04, -2.3465262819040818E+04, 5.3299736484284360E-08, 2.3465262819251962E+04, -2.3216330734049119E+04, 9.7024371533890644E+03, -1.4992527030548747E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {-7.9857427421129714E+01, -4.0585588534807385E+02, 2.6054813773472697E+03, -6.1806593581075495E+03, 8.0679596874001718E+03, -6.1806593581869265E+03, 2.6054813773147021E+03, -4.0585588535363172E+02, -7.9857427421126204E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {-7.1572272057937070E+01, 2.2785637019511205E+02, -3.9109820765665262E+02, 3.3597424711470910E+02, 1.0596763818009852E-07, -3.3597424723359080E+02, 3.9109820766854079E+02, -2.2785637019009673E+02, 7.1572272057939983E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {-9.8886360698074700E+00, 3.5359026949867051E+01, -8.5251867715709949E+01, 1.4285748012617628E+02, -1.6935269668779691E+02, 1.4285748010331625E+02, -8.5251867711661305E+01, 3.5359026944299828E+01, -9.8886360698207305E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==10) { - FLT c0[] = {2.2594586605749264E+04, 1.3595989066786593E+07, 4.4723032442444897E+08, 3.3781755837397518E+09, 8.6836783895849819E+09, 8.6836783895849762E+09, 3.3781755837397494E+09, 4.4723032442444897E+08, 1.3595989066786474E+07, 2.2594586605749344E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {1.0729981697645642E+05, 3.0651490267742988E+07, 5.9387966085130465E+08, 2.4434902657508330E+09, 2.0073077861288922E+09, -2.0073077861288943E+09, -2.4434902657508330E+09, -5.9387966085130453E+08, -3.0651490267742816E+07, -1.0729981697645638E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {2.2340399734184606E+05, 3.0258214643190462E+07, 3.1512411458738232E+08, 4.3618276932319808E+08, -7.8178848450497293E+08, -7.8178848450497019E+08, 4.3618276932319826E+08, 3.1512411458738232E+08, 3.0258214643190313E+07, 2.2340399734184548E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {2.6917433004353486E+05, 1.6875651476661228E+07, 7.4664745481963441E+07, -9.5882157211118385E+07, -2.0622994435532519E+08, 2.0622994435532743E+08, 9.5882157211118177E+07, -7.4664745481963515E+07, -1.6875651476661161E+07, -2.6917433004353428E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {2.0818422772177903E+05, 5.6084730690362519E+06, 1.4435118192351763E+06, -4.0063869969544649E+07, 3.2803674392747045E+07, 3.2803674392746095E+07, -4.0063869969546899E+07, 1.4435118192351642E+06, 5.6084730690362034E+06, 2.0818422772177853E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {1.0781139496011091E+05, 9.9202615851199068E+05, -3.3266265543962116E+06, -4.8557049011479173E+05, 1.0176155522772279E+07, -1.0176155522772269E+07, 4.8557049011678610E+05, 3.3266265543963453E+06, -9.9202615851196018E+05, -1.0781139496011072E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {3.7380102688153558E+04, 1.2716675000355666E+04, -6.2163527451774501E+05, 1.4157962667184104E+06, -8.4419693137680157E+05, -8.4419693137743860E+05, 1.4157962667189445E+06, -6.2163527451771160E+05, 1.2716675000340010E+04, 3.7380102688153442E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {8.1238936393894646E+03, -3.4872365530450072E+04, 2.3913680325196314E+04, 1.2428850301830019E+05, -3.2158255329716846E+05, 3.2158255329951923E+05, -1.2428850301867779E+05, -2.3913680325277423E+04, 3.4872365530457188E+04, -8.1238936393894255E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {7.8515926628982663E+02, -6.6607899119372642E+03, 2.0167398338513311E+04, -2.8951401344519112E+04, 1.4622828142848679E+04, 1.4622828143544031E+04, -2.8951401346900999E+04, 2.0167398338398041E+04, -6.6607899119505255E+03, 7.8515926628967964E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {-1.0147176570537010E+02, -3.5304284185385157E+01, 1.3576976854876134E+03, -4.3921059353471856E+03, 7.3232085271125388E+03, -7.3232085273978546E+03, 4.3921059367737662E+03, -1.3576976854043962E+03, 3.5304284185385157E+01, 1.0147176570550941E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {-4.3161545259389186E+01, 1.5498490981579428E+02, -3.1771250774232175E+02, 3.7215448796427023E+02, -1.7181762832770994E+02, -1.7181763036843782E+02, 3.7215448789408123E+02, -3.1771250773692140E+02, 1.5498490982186786E+02, -4.3161545259547800E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c11[] = {-4.2916172038214198E+00, 1.7402146071148604E+01, -4.7947588069135868E+01, 9.2697698088029625E+01, -1.2821427596894478E+02, 1.2821427705670308E+02, -9.2697698297776569E+01, 4.7947588093524907E+01, -1.7402146074502035E+01, 4.2916172038452141E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==11) { - FLT c0[] = {3.7794653219809625E+04, 3.4782300224660739E+07, 1.6188020733727551E+09, 1.7196758809615005E+10, 6.3754384857724617E+10, 9.7196447559193497E+10, 6.3754384857724617E+10, 1.7196758809614998E+10, 1.6188020733727560E+09, 3.4782300224660769E+07, 3.7794653219808984E+04, 0.0000000000000000E+00}; - FLT c1[] = {1.8969206922085886E+05, 8.4769319065313652E+07, 2.4230555767723408E+09, 1.5439732722639101E+10, 2.7112836839612309E+10, 2.5609833368650835E-06, -2.7112836839612328E+10, -1.5439732722639105E+10, -2.4230555767723408E+09, -8.4769319065313682E+07, -1.8969206922085711E+05, 0.0000000000000000E+00}; - FLT c2[] = {4.2138380313901440E+05, 9.2050522922791913E+07, 1.5259983101266613E+09, 4.7070559561237173E+09, -1.2448027572952359E+09, -1.0161446790279301E+10, -1.2448027572952316E+09, 4.7070559561237268E+09, 1.5259983101266615E+09, 9.2050522922791913E+07, 4.2138380313901149E+05, 0.0000000000000000E+00}; - FLT c3[] = {5.4814313598122005E+05, 5.8085130777589552E+07, 4.9484006166551048E+08, 1.6222124676640952E+08, -2.0440440381345339E+09, 9.1416457449079640E-06, 2.0440440381345336E+09, -1.6222124676640788E+08, -4.9484006166551071E+08, -5.8085130777589560E+07, -5.4814313598121714E+05, 0.0000000000000000E+00}; - FLT c4[] = {4.6495183529254980E+05, 2.3067199578027144E+07, 6.9832590192482382E+07, -2.2024799260683522E+08, -1.2820270942588677E+08, 5.1017181199129778E+08, -1.2820270942588474E+08, -2.2024799260683942E+08, 6.9832590192482322E+07, 2.3067199578027155E+07, 4.6495183529254742E+05, 0.0000000000000000E+00}; - FLT c5[] = {2.7021781043532980E+05, 5.6764510325100143E+06, -5.5650761736748898E+06, -3.9907385617900200E+07, 7.2453390663687646E+07, 1.2300109686762266E-05, -7.2453390663684472E+07, 3.9907385617899075E+07, 5.5650761736749066E+06, -5.6764510325099993E+06, -2.7021781043532846E+05, 0.0000000000000000E+00}; - FLT c6[] = {1.0933249308680627E+05, 6.9586821127987828E+05, -3.6860240321937902E+06, 2.7428169457736355E+06, 8.3392008440593518E+06, -1.6402201025046850E+07, 8.3392008440698013E+06, 2.7428169457778852E+06, -3.6860240321937371E+06, 6.9586821127989423E+05, 1.0933249308680571E+05, 0.0000000000000000E+00}; - FLT c7[] = {3.0203516161820498E+04, -3.6879059542768438E+04, -4.1141031216788280E+05, 1.4111389975267777E+06, -1.5914376635331670E+06, 9.4095582602103753E-06, 1.5914376635379130E+06, -1.4111389975247320E+06, 4.1141031216776522E+05, 3.6879059542750314E+04, -3.0203516161820549E+04, 0.0000000000000000E+00}; - FLT c8[] = {5.1670143574922731E+03, -2.8613147115372190E+04, 4.3560195427081359E+04, 4.8438679582765450E+04, -2.5856630639231802E+05, 3.7994883866738499E+05, -2.5856630640319458E+05, 4.8438679579510936E+04, 4.3560195426766244E+04, -2.8613147115376054E+04, 5.1670143574922913E+03, 0.0000000000000000E+00}; - FLT c9[] = {3.0888018539740131E+02, -3.7949446187471626E+03, 1.4313303204988082E+04, -2.6681600235594462E+04, 2.3856005166166615E+04, 8.6424601730164351E-06, -2.3856005155895236E+04, 2.6681600234453199E+04, -1.4313303205083188E+04, 3.7949446187583080E+03, -3.0888018539728523E+02, 0.0000000000000000E+00}; - FLT c10[] = {-8.3747489794189363E+01, 1.1948077479405792E+02, 4.8528498015072080E+02, -2.5024391114755094E+03, 5.3511195318669425E+03, -6.7655484107390166E+03, 5.3511195362291774E+03, -2.5024391131167667E+03, 4.8528498019392708E+02, 1.1948077480620087E+02, -8.3747489794426258E+01, 0.0000000000000000E+00}; - FLT c11[] = {-2.2640047135517630E+01, 9.0840898563949466E+01, -2.1597187544386938E+02, 3.1511229111443720E+02, -2.4856617998395282E+02, 6.1683918215190516E-06, 2.4856618439352349E+02, -3.1511228757800421E+02, 2.1597187557069353E+02, -9.0840898570046704E+01, 2.2640047135565219E+01, 0.0000000000000000E+00}; - FLT c12[] = {-1.6306382886201207E+00, 7.3325946591320434E+00, -2.3241017682854558E+01, 5.1715494398901185E+01, -8.2673000279130790E+01, 9.6489719151212370E+01, -8.2673010381149226E+01, 5.1715494328769353E+01, -2.3241018024860580E+01, 7.3325946448852415E+00, -1.6306382886460551E+00, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==12) { - FLT c0[] = {6.1722991679852908E+04, 8.4789650417103648E+07, 5.4431675199498701E+09, 7.8788892335272232E+10, 4.0355760945670044E+11, 8.8071481911347949E+11, 8.8071481911347961E+11, 4.0355760945670044E+11, 7.8788892335272430E+10, 5.4431675199498835E+09, 8.4789650417103708E+07, 6.1722991679871957E+04}; - FLT c1[] = {3.2561466099406168E+05, 2.2112758120210618E+08, 8.9911609880089817E+09, 8.3059508064200943E+10, 2.3965569143469864E+11, 1.6939286803305212E+11, -1.6939286803305203E+11, -2.3965569143469864E+11, -8.3059508064201080E+10, -8.9911609880089989E+09, -2.2112758120210618E+08, -3.2561466099404311E+05}; - FLT c2[] = {7.6621098001581512E+05, 2.6026568260310286E+08, 6.4524338253008652E+09, 3.3729904113826820E+10, 2.8555202212474091E+10, -6.8998572040731537E+10, -6.8998572040731445E+10, 2.8555202212474079E+10, 3.3729904113826824E+10, 6.4524338253008757E+09, 2.6026568260310274E+08, 7.6621098001583829E+05}; - FLT c3[] = {1.0657807616803218E+06, 1.8144472126890984E+08, 2.5524827004349842E+09, 5.2112383911371660E+09, -1.0268350564014645E+10, -1.4763245309081306E+10, 1.4763245309081314E+10, 1.0268350564014671E+10, -5.2112383911371059E+09, -2.5524827004349871E+09, -1.8144472126890984E+08, -1.0657807616803099E+06}; - FLT c4[] = {9.7829638830158755E+05, 8.2222351241519913E+07, 5.5676911894064474E+08, -4.8739037675427330E+08, -2.7153428193078227E+09, 2.5627633609246106E+09, 2.5627633609246163E+09, -2.7153428193078651E+09, -4.8739037675430620E+08, 5.5676911894064546E+08, 8.2222351241519868E+07, 9.7829638830161188E+05}; - FLT c5[] = {6.2536876825114002E+05, 2.4702814073680203E+07, 4.1488431554846466E+07, -2.9274790542418826E+08, 1.0742154109191516E+08, 6.2185168968032193E+08, -6.2185168968012476E+08, -1.0742154109184742E+08, 2.9274790542423087E+08, -4.1488431554843128E+07, -2.4702814073680237E+07, -6.2536876825112454E+05}; - FLT c6[] = {2.8527714307528478E+05, 4.6266378435690766E+06, -1.0665598090790771E+07, -2.6048960239891130E+07, 9.1597254427317813E+07, -5.9794495983264342E+07, -5.9794495983220413E+07, 9.1597254427343085E+07, -2.6048960239921503E+07, -1.0665598090794146E+07, 4.6266378435690673E+06, 2.8527714307530399E+05}; - FLT c7[] = {9.2873647411234080E+04, 3.6630046787425119E+05, -3.1271047224730137E+06, 4.8612412939252760E+06, 3.3820440907796426E+06, -1.6880127953704204E+07, 1.6880127953756198E+07, -3.3820440907614031E+06, -4.8612412938993908E+06, 3.1271047224752530E+06, -3.6630046787425695E+05, -9.2873647411217215E+04}; - FLT c8[] = {2.0817947751046438E+04, -5.5660303410315042E+04, -1.9519783923444615E+05, 1.0804817251338551E+06, -1.8264985852555393E+06, 9.7602844968061335E+05, 9.7602844962902542E+05, -1.8264985852963410E+06, 1.0804817251124913E+06, -1.9519783923503032E+05, -5.5660303410363231E+04, 2.0817947751063632E+04}; - FLT c9[] = {2.7986023314783361E+03, -1.9404411093655592E+04, 4.3922625000519314E+04, -7.6450317451901383E+03, -1.5273911974273989E+05, 3.3223441458516393E+05, -3.3223441441930021E+05, 1.5273911979752057E+05, 7.6450317512768806E+03, -4.3922624998141677E+04, 1.9404411093637758E+04, -2.7986023314644049E+03}; - FLT c10[] = {6.7849020474048089E+01, -1.7921351308204744E+03, 8.4980694686552797E+03, -1.9742624859769410E+04, 2.4620674845030797E+04, -1.1676544851227827E+04, -1.1676544869194569E+04, 2.4620674845030626E+04, -1.9742624831436660E+04, 8.4980694630406069E+03, -1.7921351308312935E+03, 6.7849020488592075E+01}; - FLT c11[] = {-5.4577020998836872E+01, 1.3637112867242237E+02, 4.5513616580246023E+01, -1.1174001367986359E+03, 3.2018769312434206E+03, -5.0580351396215219E+03, 5.0580351683422405E+03, -3.2018769242193171E+03, 1.1174000998831286E+03, -4.5513609243969356E+01, -1.3637112867730119E+02, 5.4577021011726984E+01}; - FLT c12[] = {-1.0538365872268786E+01, 4.6577222488645518E+01, -1.2606964198473415E+02, 2.1881091668968099E+02, -2.3273399614976032E+02, 1.0274275204276027E+02, 1.0274270265494516E+02, -2.3273401859852868E+02, 2.1881091865396468E+02, -1.2606964777237258E+02, 4.6577222453584369E+01, -1.0538365860573146E+01}; - FLT c13[] = {-4.6087004144309118E-01, 2.5969759128998060E+00, -9.6946932216381381E+00, 2.4990041962121211E+01, -4.6013909139329137E+01, 6.2056985032913090E+01, -6.2056925855365186E+01, 4.6013921000662158E+01, -2.4990037445376750E+01, 9.6946954085586885E+00, -2.5969759201692755E+00, 4.6087004744129911E-01}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==13) { - FLT c0[] = {9.8715725867495363E+04, 1.9828875496808097E+08, 1.7196758809614983E+10, 3.3083776881353577E+11, 2.2668873993375439E+12, 6.7734720591167568E+12, 9.6695220682534785E+12, 6.7734720591167432E+12, 2.2668873993375430E+12, 3.3083776881353503E+11, 1.7196758809614998E+10, 1.9828875496807891E+08, 9.8715725867496090E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {5.4491110456935549E+05, 5.4903670125539351E+08, 3.0879465445278183E+10, 3.9588436413399969E+11, 1.6860562536749778E+12, 2.4256447893117891E+12, -5.5583944938791784E-05, -2.4256447893117847E+12, -1.6860562536749768E+12, -3.9588436413399890E+11, -3.0879465445278183E+10, -5.4903670125538898E+08, -5.4491110456935526E+05, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.3504711883426071E+06, 6.9286979077463162E+08, 2.4618123595484577E+10, 1.9493985627722607E+11, 3.9422703517046350E+11, -1.8678883613919861E+11, -8.5538079834550110E+11, -1.8678883613919730E+11, 3.9422703517046375E+11, 1.9493985627722589E+11, 2.4618123595484566E+10, 6.9286979077462614E+08, 1.3504711883426069E+06, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.9937206140846491E+06, 5.2512029493765980E+08, 1.1253303793811750E+10, 4.6205527735932152E+10, -1.1607472377983305E+10, -1.6305241755642313E+11, 3.5385440504350348E-04, 1.6305241755642365E+11, 1.1607472377982582E+10, -4.6205527735932213E+10, -1.1253303793811750E+10, -5.2512029493765628E+08, -1.9937206140846489E+06, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {1.9607419630386413E+06, 2.6425362558103892E+08, 3.1171259341747193E+09, 2.9839860297839913E+09, -1.9585031917561897E+10, -5.0666917387065792E+09, 3.6568794485480583E+10, -5.0666917387057562E+09, -1.9585031917561817E+10, 2.9839860297838497E+09, 3.1171259341747184E+09, 2.6425362558103728E+08, 1.9607419630386417E+06, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {1.3593773865640305E+06, 9.1556445104158267E+07, 4.7074012944133747E+08, -1.1192579335657008E+09, -2.1090780087868555E+09, 5.2270306737951984E+09, 5.6467240041521856E-04, -5.2270306737934217E+09, 2.1090780087880819E+09, 1.1192579335658383E+09, -4.7074012944133127E+08, -9.1556445104157984E+07, -1.3593773865640305E+06, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {6.8417206432039209E+05, 2.1561705510027152E+07, 7.5785249893055111E+06, -2.7456096030221754E+08, 3.4589095671054310E+08, 4.0256106808894646E+08, -1.0074306926603404E+09, 4.0256106809081393E+08, 3.4589095670997137E+08, -2.7456096030236483E+08, 7.5785249893030487E+06, 2.1561705510027405E+07, 6.8417206432039209E+05, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {2.5248269397037517E+05, 3.0985559672616189E+06, -1.1816517087616559E+07, -8.2958498770184973E+06, 8.0546642347355247E+07, -1.0594657799485898E+08, 2.1816722293163801E-04, 1.0594657799424352E+08, -8.0546642347497791E+07, 8.2958498771036500E+06, 1.1816517087615721E+07, -3.0985559672621777E+06, -2.5248269397037517E+05, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {6.7530100970876694E+04, 1.2373362326658823E+05, -2.1245597183281910E+06, 5.1047323238754412E+06, -1.4139444405488928E+06, -1.1818267555096827E+07, 2.0121548578624789E+07, -1.1818267557079868E+07, -1.4139444401348191E+06, 5.1047323236516044E+06, -2.1245597183309775E+06, 1.2373362326702787E+05, 6.7530100970876316E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {1.2421368748961073E+04, -5.0576243647011936E+04, -4.8878193436902722E+04, 6.5307896872028301E+05, -1.5497610127060430E+06, 1.5137725917321201E+06, 4.1615986404011299E-04, -1.5137725918538549E+06, 1.5497610130469005E+06, -6.5307896856811445E+05, 4.8878193438804832E+04, 5.0576243646433126E+04, -1.2421368748961073E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {1.2904654687550299E+03, -1.1169946055009055E+04, 3.3275109713863385E+04, -3.1765222274236821E+04, -5.9810982085323274E+04, 2.2355863038592847E+05, -3.1083591705219547E+05, 2.2355863445202672E+05, -5.9810982721084511E+04, -3.1765222464963932E+04, 3.3275109714208855E+04, -1.1169946054555618E+04, 1.2904654687545376E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c11[] = {-1.9043622268674213E+01, -6.8296542209516542E+02, 4.2702512274202591E+03, -1.2165497317825058E+04, 1.9423733298269544E+04, -1.6010024066956401E+04, 3.4018642874429026E-04, 1.6010021599471667E+04, -1.9423732817821805E+04, 1.2165497483905752E+04, -4.2702512286689680E+03, 6.8296542153908558E+02, 1.9043622268312891E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c12[] = {-3.0093984465361217E+01, 9.8972865724808671E+01, -9.7437038666761538E+01, -3.5079928405373198E+02, 1.5699250566648977E+03, -3.1287439837941820E+03, 3.8692196309709061E+03, -3.1287462825615335E+03, 1.5699252631958864E+03, -3.5079944793112952E+02, -9.7437041893750632E+01, 9.8972866189610414E+01, -3.0093984465884773E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c13[] = {-4.3050286009489040E+00, 2.1108975724659501E+01, -6.4297198812570272E+01, 1.2922884632277874E+02, -1.6991812716212596E+02, 1.2655005901719436E+02, 9.2483537895948854E-05, -1.2655066232531748E+02, 1.6991805207569072E+02, -1.2922893667436634E+02, 6.4297198424711908E+01, -2.1108976207523057E+01, 4.3050286009485790E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c14[] = {-1.0957333716725008E-01, 7.2949317004436565E-01, -3.4300816058693728E+00, 1.0470054474579324E+01, -2.2292134950656113E+01, 3.4570827323582719E+01, -3.9923523442753932E+01, 3.4573264959502886E+01, -2.2292358612963266E+01, 1.0470042004916014E+01, -3.4300810538570281E+00, 7.2949352113279253E-01, -1.0957333740315604E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==14) { - FLT c0[] = {1.5499533202966207E+05, 4.4723032442444688E+08, 5.1495083701694740E+10, 1.2904576022918071E+12, 1.1534950432785506E+13, 4.5650102198520484E+13, 8.8830582190032641E+13, 8.8830582190032641E+13, 4.5650102198520492E+13, 1.1534950432785527E+13, 1.2904576022918074E+12, 5.1495083701695107E+10, 4.4723032442444855E+08, 1.5499533202970232E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {8.9188339002980455E+05, 1.3065352538728635E+09, 9.9400185225815567E+10, 1.7136059013402405E+12, 1.0144146621675832E+13, 2.3034036018490715E+13, 1.4630967270448871E+13, -1.4630967270448855E+13, -2.3034036018490719E+13, -1.0144146621675846E+13, -1.7136059013402405E+12, -9.9400185225815964E+10, -1.3065352538728662E+09, -8.9188339002979454E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {2.3170473769379663E+06, 1.7532505043698256E+09, 8.6523535958354309E+10, 9.7455289065487354E+11, 3.2977972139362314E+12, 1.7874626001697781E+12, -6.1480918082633916E+12, -6.1480918082633975E+12, 1.7874626001697690E+12, 3.2977972139362285E+12, 9.7455289065487329E+11, 8.6523535958354630E+10, 1.7532505043698275E+09, 2.3170473769380399E+06, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {3.6089249230396422E+06, 1.4278058213962190E+09, 4.4296625537022423E+10, 2.9466624630419781E+11, 3.1903621584503235E+11, -9.8834691411254565E+11, -1.1072264714919226E+12, 1.1072264714919316E+12, 9.8834691411255151E+11, -3.1903621584503467E+11, -2.9466624630419769E+11, -4.4296625537022621E+10, -1.4278058213962219E+09, -3.6089249230396664E+06, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {3.7733555140851745E+06, 7.8376718099107409E+08, 1.4443117772349569E+10, 4.3197433307418671E+10, -7.6585042240585556E+10, -1.8569640140763062E+11, 2.0385335192657199E+11, 2.0385335192656519E+11, -1.8569640140762662E+11, -7.6585042240580856E+10, 4.3197433307418686E+10, 1.4443117772349669E+10, 7.8376718099107552E+08, 3.7733555140852560E+06, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {2.8079157920112358E+06, 3.0340753492383724E+08, 2.9498136661747241E+09, -6.2820200387919831E+08, -2.2372008390623215E+10, 1.5217518660584890E+10, 4.0682590266891922E+10, -4.0682590266869431E+10, -1.5217518660582748E+10, 2.2372008390625935E+10, 6.2820200387968791E+08, -2.9498136661747637E+09, -3.0340753492383808E+08, -2.8079157920112377E+06, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {1.5361613559533111E+06, 8.3513615594416574E+07, 3.0077547202708024E+08, -1.3749596754067802E+09, -6.6733027297557127E+08, 5.9590333632819109E+09, -4.3025685566870070E+09, -4.3025685566872711E+09, 5.9590333632806673E+09, -6.6733027297523963E+08, -1.3749596754067125E+09, 3.0077547202709383E+08, 8.3513615594416171E+07, 1.5361613559533576E+06, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {6.2759409419592959E+05, 1.5741723594963098E+07, -1.5632610223406436E+07, -1.9294824907078514E+08, 4.4643806532434595E+08, 1.5178998385244830E+07, -9.6771139891725647E+08, 9.6771139892509627E+08, -1.5178998381042883E+07, -4.4643806533176166E+08, 1.9294824907065383E+08, 1.5632610223392555E+07, -1.5741723594963137E+07, -6.2759409419590747E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {1.9151404903933613E+05, 1.7156606891563335E+06, -9.7733523156688716E+06, 4.2982266233154163E+06, 5.1660907884347722E+07, -1.1279400211155911E+08, 6.4701089573962681E+07, 6.4701089571562663E+07, -1.1279400211012064E+08, 5.1660907891220264E+07, 4.2982266233826512E+06, -9.7733523157112263E+06, 1.7156606891560503E+06, 1.9151404903936724E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {4.2715272622845026E+04, -2.2565910611953568E+03, -1.1769776156959014E+06, 4.0078399907813077E+06, -3.8951858063335596E+06, -5.0944610754510267E+06, 1.6765992446914168E+07, -1.6765992426657490E+07, 5.0944610781778870E+06, 3.8951858062361716E+06, -4.0078399907326135E+06, 1.1769776157141617E+06, 2.2565910606306688E+03, -4.2715272622820135E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {6.4806786522793900E+03, -3.5474227032974472E+04, 1.8237100709385861E+04, 3.0934714629696816E+05, -1.0394703931686131E+06, 1.4743920333143482E+06, -7.3356882447856572E+05, -7.3356882916658197E+05, 1.4743920305501707E+06, -1.0394703929917105E+06, 3.0934714631908614E+05, 1.8237100665157792E+04, -3.5474227033406372E+04, 6.4806786523010323E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c11[] = {4.9913632908459954E+02, -5.5416668524952684E+03, 2.0614058717617296E+04, -3.2285139072943130E+04, -5.3099550821623425E+03, 1.1559000502166932E+05, -2.2569743259261423E+05, 2.2569743616896842E+05, -1.1559000130545651E+05, 5.3099543129458480E+03, 3.2285139142872020E+04, -2.0614058670790018E+04, 5.5416668533342381E+03, -4.9913632906195977E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c12[] = {-3.3076333188134086E+01, -1.8970588563697331E+02, 1.8160423493164808E+03, -6.3715703355644328E+03, 1.2525624574329036E+04, -1.4199806452802783E+04, 6.4441892296909591E+03, 6.4441909537524216E+03, -1.4199808176873401E+04, 1.2525626154733827E+04, -6.3715704433222418E+03, 1.8160422729911850E+03, -1.8970588700495102E+02, -3.3076333168231550E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c13[] = {-1.4394533627743886E+01, 5.7000699089242815E+01, -1.0101142663923416E+02, -3.2954197414395189E+01, 6.1417879182394654E+02, -1.6177283846697430E+03, 2.4593386157454975E+03, -2.4593322941165261E+03, 1.6177291239900730E+03, -6.1417952013923764E+02, 3.2954100943010943E+01, 1.0101142710333265E+02, -5.7000699100179844E+01, 1.4394533639240331E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c14[] = {-1.5925952284027161E+00, 8.5113930215357829E+00, -2.8993523187012922E+01, 6.6373454994590404E+01, -1.0329574518449559E+02, 1.0280184257681817E+02, -4.3896094875192006E+01, -4.3899302208087086E+01, 1.0280039795628096E+02, -1.0329511291885207E+02, 6.6373435700858948E+01, -2.8993536490606409E+01, 8.5113924808491728E+00, -1.5925952194145006E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c15[] = {1.5984868520881029E-02, 1.2876175212962959E-01, -9.8358742969175483E-01, 3.7711523389360830E+00, -9.4305498095765508E+00, 1.6842854581416674E+01, -2.2308566502972713E+01, 2.2308940200151390E+01, -1.6841512668820517E+01, 9.4313524091989347E+00, -3.7710716543179599E+00, 9.8361025494556609E-01, -1.2876100566420701E-01, -1.5984859433053292E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==15) { - FLT c0[] = {2.3939707792241839E+05, 9.7700272582690191E+08, 1.4715933396485257E+11, 4.7242424833337158E+12, 5.3987426629953594E+13, 2.7580474290566078E+14, 7.0693378336533400E+14, 9.6196578554477775E+14, 7.0693378336533400E+14, 2.7580474290566125E+14, 5.3987426629953766E+13, 4.7242424833337246E+12, 1.4715933396485263E+11, 9.7700272582690215E+08, 2.3939707792242285E+05, 0.0000000000000000E+00}; - FLT c1[] = {1.4314487885226035E+06, 2.9961416925358453E+09, 3.0273361232748438E+11, 6.8507333793903584E+12, 5.4192702756911000E+13, 1.7551587948105309E+14, 2.1874615668430150E+14, 3.4316191014053393E-02, -2.1874615668430150E+14, -1.7551587948105334E+14, -5.4192702756911180E+13, -6.8507333793903701E+12, -3.0273361232748438E+11, -2.9961416925358458E+09, -1.4314487885226049E+06, 0.0000000000000000E+00}; - FLT c2[] = {3.8829497354762917E+06, 4.2473082696966448E+09, 2.8414312556015540E+11, 4.3688281331121411E+12, 2.1823119508000543E+13, 3.2228098609392094E+13, -2.1833085454691789E+13, -7.3750710225100812E+13, -2.1833085454691820E+13, 3.2228098609392055E+13, 2.1823119508000594E+13, 4.3688281331121479E+12, 2.8414312556015527E+11, 4.2473082696966434E+09, 3.8829497354762889E+06, 0.0000000000000000E+00}; - FLT c3[] = {6.3495763451755755E+06, 3.6841035003733950E+09, 1.5965774278321045E+11, 1.5630338683778201E+12, 3.8749058615819268E+12, -2.7319740087723574E+12, -1.3233342822865402E+13, 6.1642230420317079E-02, 1.3233342822865449E+13, 2.7319740087723975E+12, -3.8749058615819365E+12, -1.5630338683778203E+12, -1.5965774278321042E+11, -3.6841035003733935E+09, -6.3495763451755764E+06, 0.0000000000000000E+00}; - FLT c4[] = {7.0146619045520434E+06, 2.1782897863065763E+09, 5.8897780310148087E+10, 3.1953009601770325E+11, 4.0651527029737198E+08, -1.6379148273276064E+12, -1.1568753137013029E+11, 2.7451653250460508E+12, -1.1568753137012485E+11, -1.6379148273277261E+12, 4.0651527029819238E+08, 3.1953009601770361E+11, 5.8897780310148087E+10, 2.1782897863065763E+09, 7.0146619045520443E+06, 0.0000000000000000E+00}; - FLT c5[] = {5.5580012413990172E+06, 9.2345162185944164E+08, 1.4522950934020109E+10, 2.7025952371212009E+10, -1.2304576967641914E+11, -1.0116752717202786E+11, 3.8517418245458325E+11, 1.0918347404432817E-01, -3.8517418245444312E+11, 1.0116752717221135E+11, 1.2304576967643665E+11, -2.7025952371214943E+10, -1.4522950934020079E+10, -9.2345162185944211E+08, -5.5580012413990181E+06, 0.0000000000000000E+00}; - FLT c6[] = {3.2693972344231778E+06, 2.8610260147425205E+08, 2.2348528403750563E+09, -3.4574515574242272E+09, -1.7480626463583939E+10, 3.1608597465540653E+10, 1.9879262560072273E+10, -6.6148013553772224E+10, 1.9879262560085339E+10, 3.1608597465515747E+10, -1.7480626463576942E+10, -3.4574515574198236E+09, 2.2348528403750110E+09, 2.8610260147425193E+08, 3.2693972344231787E+06, 0.0000000000000000E+00}; - FLT c7[] = {1.4553539959296256E+06, 6.4136842048384041E+07, 1.3622336582062906E+08, -1.2131510424644001E+09, 6.4322366984221375E+08, 4.5078753872047586E+09, -7.1689413746930647E+09, 3.2906916833662987E-02, 7.1689413746724453E+09, -4.5078753875009747E+09, -6.4322366985365331E+08, 1.2131510424608817E+09, -1.3622336582067037E+08, -6.4136842048384242E+07, -1.4553539959296256E+06, 0.0000000000000000E+00}; - FLT c8[] = {4.9358776531681651E+05, 9.7772970960585065E+06, -2.3511574237987626E+07, -1.0142613816641946E+08, 3.9421144218035364E+08, -2.8449115593052310E+08, -5.7549243243741119E+08, 1.1608781631182449E+09, -5.7549243240763104E+08, -2.8449115600447333E+08, 3.9421144214381480E+08, -1.0142613816429654E+08, -2.3511574237995699E+07, 9.7772970960588697E+06, 4.9358776531681546E+05, 0.0000000000000000E+00}; - FLT c9[] = {1.2660319987326677E+05, 7.7519511328119377E+05, -6.5244610661450895E+06, 9.0878257488052379E+06, 2.3116605621149920E+07, -8.7079594462079599E+07, 9.5542733739275128E+07, 6.0548970733798724E-02, -9.5542733661364838E+07, 8.7079594608550951E+07, -2.3116605559600785E+07, -9.0878257522138134E+06, 6.5244610661298726E+06, -7.7519511328133650E+05, -1.2660319987326639E+05, 0.0000000000000000E+00}; - FLT c10[] = {2.3793325531458529E+04, -4.2305332803808597E+04, -5.2884156985535356E+05, 2.5307340127864038E+06, -4.0404175271559842E+06, -1.7519992360184138E+05, 1.0146438805818636E+07, -1.5828545480742473E+07, 1.0146438778928882E+07, -1.7520004389869148E+05, -4.0404175770437294E+06, 2.5307340149977510E+06, -5.2884156989405944E+05, -4.2305332803937294E+04, 2.3793325531459184E+04, 0.0000000000000000E+00}; - FLT c11[] = {2.9741655196834722E+03, -2.0687056403786246E+04, 3.3295507799709936E+04, 1.0661145730323243E+05, -5.6644238105382060E+05, 1.0874811616841732E+06, -9.6561270266008016E+05, 1.5626594062671070E-02, 9.6561272951271443E+05, -1.0874812528712249E+06, 5.6644243308078672E+05, -1.0661145838213131E+05, -3.3295507812197495E+04, 2.0687056403630129E+04, -2.9741655196846405E+03, 0.0000000000000000E+00}; - FLT c12[] = {1.5389176594899303E+02, -2.3864418511494741E+03, 1.0846266954249364E+04, -2.2940053396478714E+04, 1.4780106121058996E+04, 4.2663651769852157E+04, -1.3047648013242516E+05, 1.7468401314164279E+05, -1.3047645484607235E+05, 4.2663541429144650E+04, 1.4780036296018619E+04, -2.2940053180976502E+04, 1.0846266927315819E+04, -2.3864418517113058E+03, 1.5389176594779781E+02, 0.0000000000000000E+00}; - FLT c13[] = {-2.3857631312588978E+01, -1.9651606133609231E+01, 6.4183083829803820E+02, -2.8648433109641578E+03, 6.8249243722518859E+03, -9.7944325124827701E+03, 7.6177757600121276E+03, 1.8034307737205296E-02, -7.6177559127722052E+03, 9.7944326623113047E+03, -6.8249058342322496E+03, 2.8648407117981119E+03, -6.4183085438795774E+02, 1.9651605969778377E+01, 2.3857631312809222E+01, 0.0000000000000000E+00}; - FLT c14[] = {-6.1348505739169541E+00, 2.7872915855267404E+01, -6.5819942538871970E+01, 5.1366231962952028E+01, 1.7213955398158618E+02, -6.9658621010000411E+02, 1.3192236112353403E+03, -1.6054106225233884E+03, 1.3192031991952242E+03, -6.9663961216547739E+02, 1.7211403815802629E+02, 5.1367579954366171E+01, -6.5819957939661379E+01, 2.7872915947616441E+01, -6.1348505735855374E+00, 0.0000000000000000E+00}; - FLT c15[] = {-4.9671584513490097E-01, 3.0617550953446115E+00, -1.1650665638578070E+01, 3.0081586723089057E+01, -5.4028356726202020E+01, 6.6077203078498044E+01, -4.7145500171928198E+01, 4.2118837140985958E-03, 4.7167106663349848E+01, -6.6048394423269173E+01, 5.4062906728994193E+01, -3.0081603709324451E+01, 1.1650672008416343E+01, -3.0617551285208524E+00, 4.9671584437353217E-01, 0.0000000000000000E+00}; - FLT c16[] = {4.3460786767313729E-03, -1.3199600771767199E-02, -1.9412688562910244E-01, 1.1329433700669471E+00, -3.4442045795063887E+00, 7.1737626956468912E+00, -1.1098109271625262E+01, 1.2385772358881393E+01, -1.1101471316239516E+01, 7.0913926025978853E+00, -3.4845491148773502E+00, 1.1323523856621058E+00, -1.9414904754428672E-01, -1.3200165079792004E-02, 4.3460782759443158E-03, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else if (w==16) { - FLT c0[] = {3.6434551345570839E+05, 2.0744705928579483E+09, 4.0355760945669995E+11, 1.6364575388763029E+13, 2.3514830376056538E+14, 1.5192201717462528E+15, 4.9956173084674090E+15, 8.9287666945127360E+15, 8.9287666945127390E+15, 4.9956173084674090E+15, 1.5192201717462528E+15, 2.3514830376056538E+14, 1.6364575388763035E+13, 4.0355760945670026E+11, 2.0744705928579524E+09, 3.6434551345571183E+05}; - FLT c1[] = {2.2576246485480359E+06, 6.6499571180086451E+09, 8.7873753526056287E+11, 2.5606844387131066E+13, 2.6313738449330153E+14, 1.1495095100701460E+15, 2.1932582707747560E+15, 1.2860244365132595E+15, -1.2860244365132600E+15, -2.1932582707747578E+15, -1.1495095100701465E+15, -2.6313738449330159E+14, -2.5606844387131062E+13, -8.7873753526056299E+11, -6.6499571180086451E+09, -2.2576246485480373E+06}; - FLT c2[] = {6.3730995546265077E+06, 9.9060026035198078E+09, 8.8097248605449023E+11, 1.7953384130753688E+13, 1.2398425545001662E+14, 3.0749346493041262E+14, 1.0259777520247159E+14, -5.5291976457534325E+14, -5.5291976457534325E+14, 1.0259777520247186E+14, 3.0749346493041219E+14, 1.2398425545001659E+14, 1.7953384130753676E+13, 8.8097248605448950E+11, 9.9060026035198040E+09, 6.3730995546265030E+06}; - FLT c3[] = {1.0896915393078227E+07, 9.0890343524593849E+09, 5.3565169504010010E+11, 7.3004206720038701E+12, 2.9692333044160066E+13, 1.6051737468109549E+13, -9.1273329108089906E+13, -8.5999306918502953E+13, 8.5999306918502422E+13, 9.1273329108089984E+13, -1.6051737468109510E+13, -2.9692333044160082E+13, -7.3004206720038701E+12, -5.3565169504010022E+11, -9.0890343524593849E+09, -1.0896915393078227E+07}; - FLT c4[] = {1.2655725616100594E+07, 5.7342804054544210E+09, 2.1822836608899570E+11, 1.8300700858999690E+12, 2.7770431049857676E+12, -8.5034969223852568E+12, -1.2846668467423438E+13, 1.6519076896571838E+13, 1.6519076896572182E+13, -1.2846668467423555E+13, -8.5034969223850703E+12, 2.7770431049857896E+12, 1.8300700858999678E+12, 2.1822836608899567E+11, 5.7342804054544210E+09, 1.2655725616100591E+07}; - FLT c5[] = {1.0609303958036326E+07, 2.6255609052371716E+09, 6.1673589426039413E+10, 2.6044432099085333E+11, -3.5431628074578204E+11, -1.6077602129636348E+12, 1.5534405614728977E+12, 2.8019935380857432E+12, -2.8019935380841978E+12, -1.5534405614724106E+12, 1.6077602129635625E+12, 3.5431628074580896E+11, -2.6044432099084848E+11, -6.1673589426039429E+10, -2.6255609052371716E+09, -1.0609303958036322E+07}; - FLT c6[] = {6.6544809363384582E+06, 8.9490403680928326E+08, 1.1882638725190845E+10, 8.1552898137823076E+09, -1.2575562817886868E+11, 2.7074695075907585E+10, 3.9453789461955023E+11, -3.1679644857468066E+11, -3.1679644857392346E+11, 3.9453789461966650E+11, 2.7074695075992649E+10, -1.2575562817884555E+11, 8.1552898137788668E+09, 1.1882638725190889E+10, 8.9490403680928278E+08, 6.6544809363384554E+06}; - FLT c7[] = {3.1906872142825006E+06, 2.2785946180651775E+08, 1.3744578972809248E+09, -4.3997172592883167E+09, -9.2011130754043922E+09, 3.4690551711832901E+10, -9.4227043395047741E+09, -5.9308465070198639E+10, 5.9308465069336540E+10, 9.4227043396350136E+09, -3.4690551711738396E+10, 9.2011130753567543E+09, 4.3997172592879610E+09, -1.3744578972813025E+09, -2.2785946180651844E+08, -3.1906872142825015E+06}; - FLT c8[] = {1.1821527096621769E+06, 4.2281234059839502E+07, 2.8723226058712766E+07, -8.3553955857628822E+08, 1.2447304828823066E+09, 2.1955280943585949E+09, -7.0514195726908512E+09, 4.3745141239718714E+09, 4.3745141233600502E+09, -7.0514195728029747E+09, 2.1955280943510208E+09, 1.2447304828590808E+09, -8.3553955857879233E+08, 2.8723226058761366E+07, 4.2281234059838109E+07, 1.1821527096621762E+06}; - FLT c9[] = {3.3854610744280310E+05, 5.2176984975081543E+06, -2.0677283565079328E+07, -3.5831818968518838E+07, 2.6599346106412742E+08, -3.7992777977357000E+08, -1.3426914417466179E+08, 9.1752051229224503E+08, -9.1752051129499328E+08, 1.3426914497246322E+08, 3.7992777991069216E+08, -2.6599346104854536E+08, 3.5831818968908392E+07, 2.0677283564896725E+07, -5.2176984975075833E+06, -3.3854610744279937E+05}; - FLT c10[] = {7.3893334077310064E+04, 2.6983804209559254E+05, -3.6415998561101072E+06, 8.4025485849181097E+06, 4.9278860779345948E+06, -5.1437033846752726E+07, 8.7603898676325440E+07, -4.6199498412402093E+07, -4.6199498208604209E+07, 8.7603898435731798E+07, -5.1437033863736227E+07, 4.9278861005789889E+06, 8.4025485831489991E+06, -3.6415998560990733E+06, 2.6983804209473461E+05, 7.3893334077307401E+04}; - FLT c11[] = {1.1778892113375481E+04, -4.0077190108724200E+04, -1.8372552175909068E+05, 1.3262878399160223E+06, -2.9738539927520575E+06, 1.9493509709529271E+06, 4.1881949951139782E+06, -1.1066749616505133E+07, 1.1066749327519676E+07, -4.1881946843906553E+06, -1.9493507810665092E+06, 2.9738539818831389E+06, -1.3262878384774840E+06, 1.8372552162922107E+05, 4.0077190107319519E+04, -1.1778892113376129E+04}; - FLT c12[] = {1.2019749667923656E+03, -1.0378455844500613E+04, 2.6333352653155256E+04, 1.7117060106301305E+04, -2.5133287443653666E+05, 6.4713914262131555E+05, -8.1634942572553246E+05, 3.8623935281825601E+05, 3.8623876433339820E+05, -8.1634960962672008E+05, 6.4713900469564367E+05, -2.5133289627502396E+05, 1.7117057951236206E+04, 2.6333352581335013E+04, -1.0378455846609291E+04, 1.2019749667911419E+03}; - FLT c13[] = {3.1189837632471693E+01, -8.9083493807061564E+02, 4.9454293649337906E+03, -1.3124693635095375E+04, 1.5834784331991095E+04, 6.9607870364081436E+03, -5.9789871879430451E+04, 1.0841726514394575E+05, -1.0841709685990328E+05, 5.9790206615067997E+04, -6.9607049368128291E+03, -1.5834783935893831E+04, 1.3124692974990443E+04, -4.9454295091588992E+03, 8.9083493794871868E+02, -3.1189837631106176E+01}; - FLT c14[] = {-1.2975319073401824E+01, 1.8283698218710011E+01, 1.7684015393859755E+02, -1.1059917445033070E+03, 3.1998168298121523E+03, -5.5988200120063057E+03, 5.9248751921324047E+03, -2.5990022806343668E+03, -2.5990962125709430E+03, 5.9247537039895724E+03, -5.5988835070734467E+03, 3.1998292349030621E+03, -1.1059926481090836E+03, 1.7684013881079576E+02, 1.8283698123134819E+01, -1.2975319073977776E+01}; - FLT c15[] = {-2.3155118729954247E+00, 1.1938503634469159E+01, -3.4150562973753665E+01, 4.8898615554511437E+01, 1.5853185548633874E+01, -2.4272678107130790E+02, 6.0151276286907887E+02, -8.8751856926690448E+02, 8.8742942550355474E+02, -6.0136491467620624E+02, 2.4282489356694586E+02, -1.5850195971204462E+01, -4.8897392545563044E+01, 3.4150562973753665E+01, -1.1938504430698943E+01, 2.3155118723150525E+00}; - FLT c16[] = {-1.5401723686076832E-01, 9.8067823888634464E-01, -4.1900843552415639E+00, 1.2150534299778382E+01, -2.4763139606227178E+01, 3.6068014621628578E+01, -3.4346647779134791E+01, 1.3259903958585387E+01, 1.2937147675617604E+01, -3.4454233206790519E+01, 3.6027670086257579E+01, -2.4769863695455662E+01, 1.2149431128889342E+01, -4.1901615115388706E+00, 9.8067695636810759E-01, -1.5401723756214594E-01}; - FLT c17[] = {1.1808835093099178E-02, -2.5444299558662394E-02, -1.5661344238792723E-04, 2.5820071204205225E-01, -1.0930950485268096E+00, 2.6408492552008669E+00, -4.4415763059111955E+00, 6.8227366238712817E+00, -6.8186662643534008E+00, 4.4887924763186051E+00, -2.6327085361651021E+00, 1.0918739406714428E+00, -2.5844238963842503E-01, 1.2680123888735934E-04, 2.5444206395526567E-02, -1.1808834826225629E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i] + z*(c17[i]))))))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/src/ker_lowupsampfac_horner_allw_loop.c b/src/ker_lowupsampfac_horner_allw_loop.c deleted file mode 100644 index 7c4517f57..000000000 --- a/src/ker_lowupsampfac_horner_allw_loop.c +++ /dev/null @@ -1,192 +0,0 @@ -// Code generated by gen_all_horner_C_code.m in finufft/devel -// Authors: Alex Barnett & Ludvig af Klinteberg. -// (C) The Simons Foundation, Inc. - if (w==2) { - FLT c0[] = {2.3711015472112514E+01, 2.3711015472112514E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {2.5079742199350562E+01, -2.5079742199350562E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {-3.5023281580177050E+00, -3.5023281580177086E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {-7.3894949249195587E+00, 7.3894949249195632E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i]))); - } else if (w==3) { - FLT c0[] = {5.9620016143346824E+01, 2.4110216701187497E+02, 5.9620016148621815E+01, 0.0000000000000000E+00}; - FLT c1[] = {9.7575520958604258E+01, 9.4807967775797928E-16, -9.7575520952908519E+01, 0.0000000000000000E+00}; - FLT c2[] = {3.5838417859768512E+01, -7.3472145274965371E+01, 3.5838417865129472E+01, 0.0000000000000000E+00}; - FLT c3[] = {-1.0721643298166471E+01, -2.1299978194824344E-16, 1.0721643303220413E+01, 0.0000000000000000E+00}; - FLT c4[] = {-7.0570630207138318E+00, 9.1538553399011260E+00, -7.0570630151506633E+00, 0.0000000000000000E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i])))); - } else if (w==4) { - FLT c0[] = {1.2612470018753689E+02, 1.1896204292999116E+03, 1.1896204292999118E+03, 1.2612470018753696E+02}; - FLT c1[] = {2.6158034850676626E+02, 5.6161104654809810E+02, -5.6161104654809844E+02, -2.6158034850676620E+02}; - FLT c2[] = {1.7145379463699527E+02, -1.6695967127766517E+02, -1.6695967127766514E+02, 1.7145379463699527E+02}; - FLT c3[] = {2.3525961965887870E+01, -1.0057439659768858E+02, 1.0057439659768873E+02, -2.3525961965887827E+01}; - FLT c4[] = {-1.5608307370340880E+01, 9.5627412100260845E+00, 9.5627412100260205E+00, -1.5608307370340908E+01}; - FLT c5[] = {-4.5715207776748699E+00, 7.9904373067895493E+00, -7.9904373067893877E+00, 4.5715207776749462E+00}; - for (int i=0; i<4; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i]))))); - } else if (w==5) { - FLT c0[] = {2.4106943677442615E+02, 4.3538384278025542E+03, 9.3397486707381995E+03, 4.3538384278025515E+03, 2.4106943677442607E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {5.8781364250328272E+02, 3.4742855804122028E+03, -7.3041306797303120E-14, -3.4742855804122009E+03, -5.8781364250328249E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {5.1234107167555862E+02, 3.5219546517037116E+02, -1.7076861141633149E+03, 3.5219546517037247E+02, 5.1234107167555862E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.7540956907856057E+02, -3.5792356187777074E+02, -4.9888896652511712E-13, 3.5792356187777165E+02, -1.7540956907856059E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {-2.1768066955094961E-01, -7.8322173187697558E+01, 1.3904039464934516E+02, -7.8322173187697842E+01, -2.1768066955103071E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {-1.4207955403641256E+01, 1.6019466986221790E+01, 5.4386376890865855E-13, -1.6019466986220916E+01, 1.4207955403641320E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {-2.1966493586753826E+00, 5.0672636163194582E+00, -6.7340544905090631E+00, 5.0672636163189448E+00, -2.1966493586753089E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i])))))); - } else if (w==6) { - FLT c0[] = {4.3011762559089101E+02, 1.3368828836127070E+04, 4.9861340433371224E+04, 4.9861340433371253E+04, 1.3368828836127073E+04, 4.3011762559835148E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {1.1857225840065141E+03, 1.4112553227730617E+04, 1.5410005180819440E+04, -1.5410005180819426E+04, -1.4112553227730616E+04, -1.1857225839984601E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.2460481448413077E+03, 4.3127030215084960E+03, -5.5438591621431169E+03, -5.5438591621431306E+03, 4.3127030215084960E+03, 1.2460481448488902E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {6.0825549344387753E+02, -3.4106010789547094E+02, -1.9775725023673197E+03, 1.9775725023673208E+03, 3.4106010789547116E+02, -6.0825549343673094E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {1.1264961069783706E+02, -3.9740822717991142E+02, 2.7557540616463064E+02, 2.7557540616462472E+02, -3.9740822717991210E+02, 1.1264961070570448E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {-1.5387906304333878E+01, -3.2640579296387394E+01, 1.1683718215647470E+02, -1.1683718215646800E+02, 3.2640579296390861E+01, 1.5387906311562851E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {-9.3947198873910249E+00, 1.5069930500881778E+01, -8.0900452409597179E+00, -8.0900452409538364E+00, 1.5069930500884301E+01, -9.3947198802581902E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {-5.6048841964539509E-01, 2.3377422080924530E+00, -4.2391567591836514E+00, 4.2391567591841817E+00, -2.3377422080928629E+00, 5.6048842664294984E-01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i]))))))); - } else if (w==7) { - FLT c0[] = {7.2950392616203249E+02, 3.6439117038309480E+04, 2.1220891582018422E+05, 3.6180058567561524E+05, 2.1220891582018445E+05, 3.6439117038309487E+04, 7.2950392617434545E+02, 0.0000000000000000E+00}; - FLT c1[] = {2.2197790785452576E+03, 4.6392067080426248E+04, 1.1568051746995670E+05, -1.1902861988308852E-11, -1.1568051746995671E+05, -4.6392067080426241E+04, -2.2197790785319785E+03, 0.0000000000000000E+00}; - FLT c2[] = {2.6796845075663955E+03, 2.0921129984587249E+04, 3.9399551345574849E+01, -4.7251335435527435E+04, 3.9399551345580633E+01, 2.0921129984587245E+04, 2.6796845075789142E+03, 0.0000000000000000E+00}; - FLT c3[] = {1.6253748990844499E+03, 2.6138488347211564E+03, -1.0037546705421508E+04, 2.6823166126907972E-11, 1.0037546705421508E+04, -2.6138488347211546E+03, -1.6253748990726619E+03, 0.0000000000000000E+00}; - FLT c4[] = {4.9106375852553418E+02, -8.6668269315416171E+02, -1.0513434716618249E+03, 2.8444456471590756E+03, -1.0513434716618387E+03, -8.6668269315416057E+02, 4.9106375853851472E+02, 0.0000000000000000E+00}; - FLT c5[] = {4.0739167949763157E+01, -2.8515155742293922E+02, 3.9930326803801455E+02, 2.4847312048933061E-11, -3.9930326803798215E+02, 2.8515155742293899E+02, -4.0739167937835738E+01, 0.0000000000000000E+00}; - FLT c6[] = {-1.7148987139838667E+01, 7.5799002551700223E-01, 6.3260304953160343E+01, -1.0529869309160161E+02, 6.3260304953194023E+01, 7.5799002552709915E-01, -1.7148987128069749E+01, 0.0000000000000000E+00}; - FLT c7[] = {-4.5424411501060264E+00, 9.8749254058318616E+00, -9.6456179777547195E+00, 2.0621161109877312E-11, 9.6456179778118027E+00, -9.8749254058319202E+00, 4.5424411616514604E+00, 0.0000000000000000E+00}; - FLT c8[] = {-5.0793946806832954E-02, 7.3273813711856639E-01, -2.0117140544738263E+00, 2.6999257940856816E+00, -2.0117140545416512E+00, 7.3273813711318592E-01, -5.0793935653327994E-02, 0.0000000000000000E+00}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i])))))))); - } else if (w==8) { - FLT c0[] = {1.1895823653767145E+03, 9.0980236725236929E+04, 7.7438826909537485E+05, 2.0077596413122697E+06, 2.0077596413122697E+06, 7.7438826909537497E+05, 9.0980236725236929E+04, 1.1895823653767147E+03}; - FLT c1[] = {3.9313191526977798E+03, 1.3318570706800820E+05, 5.7275848637687636E+05, 4.6250273225257988E+05, -4.6250273225257976E+05, -5.7275848637687659E+05, -1.3318570706800820E+05, -3.9313191526977798E+03}; - FLT c2[] = {5.2976026193612370E+03, 7.5628970871188430E+04, 1.0073339198368321E+05, -1.8165150843791291E+05, -1.8165150843791291E+05, 1.0073339198368321E+05, 7.5628970871188460E+04, 5.2976026193612397E+03}; - FLT c3[] = {3.7552239608473842E+03, 1.8376340228970901E+04, -2.3878081117551585E+04, -4.6296734056047833E+04, 4.6296734056048226E+04, 2.3878081117551632E+04, -1.8376340228970901E+04, -3.7552239608473833E+03}; - FLT c4[] = {1.4742862505418652E+03, 1.2842168112178376E+02, -9.1969665138398723E+03, 7.5990739935234687E+03, 7.5990739935234151E+03, -9.1969665138399178E+03, 1.2842168112178072E+02, 1.4742862505418645E+03}; - FLT c5[] = {2.8158981009344416E+02, -8.8613607108855206E+02, 5.3457145342334378E+01, 2.1750989694614777E+03, -2.1750989694609211E+03, -5.3457145342173561E+01, 8.8613607108856843E+02, -2.8158981009344393E+02}; - FLT c6[] = {-1.4786862436240726E+00, -1.3935442261830281E+02, 3.2599325739083491E+02, -1.9541889343332295E+02, -1.9541889343339443E+02, 3.2599325739083696E+02, -1.3935442261827953E+02, -1.4786862436237442E+00}; - FLT c7[] = {-1.1542034522902307E+01, 1.2000512051397084E+01, 1.9687328710129744E+01, -6.3962883082482271E+01, 6.3962883082874910E+01, -1.9687328710101575E+01, -1.2000512051407391E+01, 1.1542034522902124E+01}; - FLT c8[] = {-1.7448292513542445E+00, 4.8577330433956609E+00, -6.8794163043773890E+00, 3.4611708987408365E+00, 3.4611708985348386E+00, -6.8794163043605385E+00, 4.8577330433771184E+00, -1.7448292513550807E+00}; - FLT c9[] = {1.5044951479021193E-01, 9.6230159355094713E-02, -7.0399250398052082E-01, 1.3251401132916929E+00, -1.3251401128795544E+00, 7.0399250407339709E-01, -9.6230159355094713E-02, -1.5044951479003055E-01}; - for (int i=0; i<8; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==9) { - FLT c0[] = {1.8793738965776997E+03, 2.1220891582018419E+05, 2.5233246441351641E+06, 9.2877384983420596E+06, 1.4015330434461458E+07, 9.2877384983420689E+06, 2.5233246441351632E+06, 2.1220891582018507E+05, 1.8793738965777015E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {6.6675066501609344E+03, 3.4704155240986997E+05, 2.2890184838322559E+06, 3.8705035445351214E+06, -1.6037058324963857E-09, -3.8705035445351251E+06, -2.2890184838322555E+06, -3.4704155240987107E+05, -6.6675066501609363E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {9.8412775404612330E+03, 2.3171563090202375E+05, 6.8167589492092200E+05, -2.1140963571671984E+05, -1.4236515118873848E+06, -2.1140963571672366E+05, 6.8167589492092165E+05, 2.3171563090202425E+05, 9.8412775404612312E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {7.8762358364031033E+03, 7.6500585979636104E+04, 1.2434778984075023E+04, -2.8572091469430045E+05, 1.5952874106327477E-09, 2.8572091469430359E+05, -1.2434778984075045E+04, -7.6500585979636220E+04, -7.8762358364031052E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {3.6941911906762084E+03, 9.9232929169975941E+03, -3.3472877669902169E+04, -1.4082384858052235E+04, 6.7911966136972551E+04, -1.4082384858047793E+04, -3.3472877669902322E+04, 9.9232929169976087E+03, 3.6941911906762070E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {9.8900189723050266E+02, -1.2736589324621855E+03, -5.0407308390126955E+03, 9.8914296140171609E+03, 1.0742991696587890E-09, -9.8914296140222541E+03, 5.0407308390134704E+03, 1.2736589324621880E+03, -9.8900189723050198E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {1.1165868717715853E+02, -5.9057035448564977E+02, 5.5860705835603983E+02, 9.1996097522959656E+02, -2.0290255886377897E+03, 9.1996097523001129E+02, 5.5860705835622480E+02, -5.9057035448564693E+02, 1.1165868717715870E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {-1.3142584300868881E+01, -4.2852762793304592E+01, 1.8188640945795066E+02, -2.1362000457567430E+02, 6.1024810759112463E-10, 2.1362000457722939E+02, -1.8188640945795305E+02, 4.2852762793363922E+01, 1.3142584300866494E+01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {-5.8088068374878068E+00, 1.0201832931362965E+01, -3.5220973519213472E-01, -2.6632420896811951E+01, 4.2737607182672249E+01, -2.6632420895534445E+01, -3.5220973562147767E-01, 1.0201832931230712E+01, -5.8088068374901178E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {-4.0642645973308456E-01, 1.8389772328416343E+00, -3.5549484953682806E+00, 3.2273562233914270E+00, 1.3413454081272250E-09, -3.2273562258526494E+00, 3.5549484959023196E+00, -1.8389772328242200E+00, 4.0642645973371377E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i]))))))))); - } else if (w==10) { - FLT c0[] = {2.8923571298063562E+03, 4.6856831608341925E+05, 7.5304732752870023E+06, 3.7576537584215783E+07, 7.9591606307847857E+07, 7.9591606307847857E+07, 3.7576537584215745E+07, 7.5304732752870042E+06, 4.6856831608341780E+05, 2.8923571298063575E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {1.0919387804943191E+04, 8.3976685277206497E+05, 7.9494027659552367E+06, 2.1606786285174552E+07, 1.4625897641453246E+07, -1.4625897641453277E+07, -2.1606786285174549E+07, -7.9494027659552367E+06, -8.3976685277206241E+05, -1.0919387804943171E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.7418455635504150E+04, 6.3489952164419880E+05, 3.1358985409389879E+06, 2.2547438801903646E+06, -6.0429762783920728E+06, -6.0429762783920513E+06, 2.2547438801903692E+06, 3.1358985409389860E+06, 6.3489952164419706E+05, 1.7418455635504110E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.5396188098732160E+04, 2.5490607173283451E+05, 4.2818880748176615E+05, -9.5435463094349275E+05, -1.2004850139039254E+06, 1.2004850139039545E+06, 9.5435463094349345E+05, -4.2818880748176581E+05, -2.5490607173283395E+05, -1.5396188098732138E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {8.2616700456447434E+03, 5.2880641964112285E+04, -6.1165055141131161E+04, -2.1590299490711108E+05, 2.1595822052157650E+05, 2.1595822052157007E+05, -2.1590299490713840E+05, -6.1165055141131197E+04, 5.2880641964112183E+04, 8.2616700456447306E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {2.7267169079066489E+03, 2.4572549134030801E+03, -2.6065821571078384E+04, 1.3919259807559451E+04, 4.6802084705699206E+04, -4.6802084705714289E+04, -1.3919259807536537E+04, 2.6065821571078890E+04, -2.4572549134029036E+03, -2.7267169079066425E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {5.0402062537834070E+02, -1.3640153425625381E+03, -1.4063198459019245E+03, 7.0858129627834105E+03, -4.8375233777605163E+03, -4.8375233777670810E+03, 7.0858129627894641E+03, -1.4063198459014579E+03, -1.3640153425626913E+03, 5.0402062537833700E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {2.4199726682542348E+01, -2.8393731159249540E+02, 5.1652001352543709E+02, 7.4578914842705018E+01, -1.1556759026365337E+03, 1.1556759026651935E+03, -7.4578914839714216E+01, -5.1652001352595710E+02, 2.8393731159268043E+02, -2.4199726682540959E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {-1.0545675122360885E+01, -3.0306758891224317E+00, 7.2305523762173834E+01, -1.3808908570221064E+02, 7.6293213403386517E+01, 7.6293213419205742E+01, -1.3808908572505672E+02, 7.2305523760424833E+01, -3.0306758894244412E+00, -1.0545675122369961E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {-2.1836930570474395E+00, 5.4992367509081630E+00, -4.5624617253163446E+00, -6.6492709819863256E+00, 2.0339240341691568E+01, -2.0339240351164950E+01, 6.6492710020476089E+00, 4.5624617253163446E+00, -5.4992367508501152E+00, 2.1836930570530630E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {-9.1748741459757727E-02, 5.2562451739588611E-01, -1.4144257958835973E+00, 1.8629578990262812E+00, -9.0169874554123419E-01, -9.0169876258108816E-01, 1.8629579026113960E+00, -1.4144257947447987E+00, 5.2562451738534777E-01, -9.1748741464373396E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i])))))))))); - } else if (w==11) { - FLT c0[] = {4.3537972057094357E+03, 9.8872306817881018E+05, 2.0938056062983289E+07, 1.3701428307175827E+08, 3.8828289972017348E+08, 5.4292197128519189E+08, 3.8828289972017324E+08, 1.3701428307175821E+08, 2.0938056062983286E+07, 9.8872306817881099E+05, 4.3537972057093830E+03, 0.0000000000000000E+00}; - FLT c1[] = {1.7371472778611496E+04, 1.9155790709433770E+06, 2.4914432724618733E+07, 9.7792160665338323E+07, 1.3126779387874992E+08, 1.1003518489948497E-08, -1.3126779387874992E+08, -9.7792160665338367E+07, -2.4914432724618725E+07, -1.9155790709433774E+06, -1.7371472778611387E+04, 0.0000000000000000E+00}; - FLT c2[] = {2.9650558537745437E+04, 1.6014973065836846E+06, 1.1867448782239100E+07, 2.0812212822540633E+07, -1.1749875870571069E+07, -4.5121922350041404E+07, -1.1749875870571032E+07, 2.0812212822540659E+07, 1.1867448782239093E+07, 1.6014973065836851E+06, 2.9650558537745299E+04, 0.0000000000000000E+00}; - FLT c3[] = {2.8505604980264394E+04, 7.4166660874053277E+05, 2.5711466441825330E+06, -1.2146931938153899E+06, -8.3931576510116160E+06, -1.5221113764487218E-08, 8.3931576510117017E+06, 1.2146931938154220E+06, -2.5711466441825316E+06, -7.4166660874053324E+05, -2.8505604980264285E+04, 0.0000000000000000E+00}; - FLT c4[] = {1.7045632829988481E+04, 1.9785834209758078E+05, 8.6361403553701501E+04, -1.0584472412326147E+06, -1.3367486018960556E+05, 1.7818009619467217E+06, -1.3367486018952832E+05, -1.0584472412326441E+06, 8.6361403553699885E+04, 1.9785834209758087E+05, 1.7045632829988419E+04, 0.0000000000000000E+00}; - FLT c5[] = {6.5462464716912918E+03, 2.5347576368078855E+04, -7.5810878908805942E+04, -8.0774039751690128E+04, 2.5492801112955116E+05, 3.6655592491345995E-08, -2.5492801112950110E+05, 8.0774039751702396E+04, 7.5810878908810162E+04, -2.5347576368078677E+04, -6.5462464716912700E+03, 0.0000000000000000E+00}; - FLT c6[] = {1.5684149291082115E+03, -1.0302687059852267E+03, -1.3446845770824435E+04, 2.0814393480320545E+04, 1.4366994276523908E+04, -4.4581342385955380E+04, 1.4366994276463982E+04, 2.0814393480325110E+04, -1.3446845770824308E+04, -1.0302687059850016E+03, 1.5684149291082128E+03, 0.0000000000000000E+00}; - FLT c7[] = {1.9398419323286222E+02, -8.7329293867281388E+02, 2.4796533428938184E+02, 3.2905701326623416E+03, -4.8989871768459579E+03, 2.8861239463615327E-09, 4.8989871768722078E+03, -3.2905701326312101E+03, -2.4796533429068171E+02, 8.7329293867237629E+02, -1.9398419323287882E+02, 0.0000000000000000E+00}; - FLT c8[] = {-4.2288232505124679E+00, -9.9574929618003850E+01, 2.9563077146126534E+02, -1.9453049352240328E+02, -4.0107401572039475E+02, 7.9532514195009401E+02, -4.0107401576942334E+02, -1.9453049354949908E+02, 2.9563077145563869E+02, -9.9574929618160851E+01, -4.2288232505049734E+00, 0.0000000000000000E+00}; - FLT c9[] = {-5.3741131162167548E+00, 5.5350606003782072E+00, 1.9153744596147156E+01, -6.3189447483342484E+01, 6.6921287710344444E+01, 2.6543499136172006E-08, -6.6921287588490713E+01, 6.3189447458080132E+01, -1.9153744593546620E+01, -5.5350606004478644E+00, 5.3741131162113120E+00, 0.0000000000000000E+00}; - FLT c10[] = {-7.0359426508237854E-01, 2.2229112757468452E+00, -3.2054079720618520E+00, 8.3392526913327172E-02, 6.8879260281453520E+00, -1.0795498333352139E+01, 6.8879260220718077E+00, 8.3392507342704467E-02, -3.2054079702060019E+00, 2.2229112757257625E+00, -7.0359426507941902E-01, 0.0000000000000000E+00}; - FLT c11[] = {5.2648094861126392E-02, 9.9912561389764148E-02, -4.3913938527232693E-01, 7.9792987484770361E-01, -6.9191816827427566E-01, -1.2022534526020762E-09, 6.9191820562024531E-01, -7.9792984883890594E-01, 4.3913938443394634E-01, -9.9912561446925147E-02, -5.2648094869462925E-02, 0.0000000000000000E+00}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i]))))))))))); - } else if (w==12) { - FLT c0[] = {6.4299692685485315E+03, 2.0077596413122714E+06, 5.4904521978991628E+07, 4.5946106674819350E+08, 1.6835469840840104E+09, 3.1308386544851556E+09, 3.1308386544851556E+09, 1.6835469840840099E+09, 4.5946106674819458E+08, 5.4904521978991754E+07, 2.0077596413122730E+06, 6.4299692685634491E+03}; - FLT c1[] = {2.6965848540274073E+04, 4.1625245902732178E+06, 7.2097002594596952E+07, 3.8505085985474640E+08, 7.9479013671674240E+08, 4.7870231281824082E+08, -4.7870231281824046E+08, -7.9479013671674252E+08, -3.8505085985474682E+08, -7.2097002594597101E+07, -4.1625245902732178E+06, -2.6965848540258085E+04}; - FLT c2[] = {4.8869694409905111E+04, 3.7863371066322513E+06, 3.9530526716552719E+07, 1.1475134266581042E+08, 4.6311261797930710E+07, -2.0442837194260675E+08, -2.0442837194260725E+08, 4.6311261797930680E+07, 1.1475134266581020E+08, 3.9530526716552787E+07, 3.7863371066322504E+06, 4.8869694409920470E+04}; - FLT c3[] = {5.0530564260114021E+04, 1.9615784087727289E+06, 1.1044597342441007E+07, 7.9812418612436540E+06, -3.4042228324588493E+07, -3.3301805987927791E+07, 3.3301805987928167E+07, 3.4042228324588671E+07, -7.9812418612435497E+06, -1.1044597342440993E+07, -1.9615784087727286E+06, -5.0530564260099913E+04}; - FLT c4[] = {3.3081876469965493E+04, 6.2011956881368335E+05, 1.3086001239863748E+06, -3.1165484297367339E+06, -5.1982996003442882E+06, 6.3530947749618590E+06, 6.3530947749616513E+06, -5.1982996003444213E+06, -3.1165484297366543E+06, 1.3086001239863599E+06, 6.2011956881368288E+05, 3.3081876469981333E+04}; - FLT c5[] = {1.4308966168506788E+04, 1.1375573205951916E+05, -1.0318195403424598E+05, -6.6892418721462542E+05, 5.9223570255461533E+05, 1.1093685152673351E+06, -1.1093685152666988E+06, -5.9223570255418238E+05, 6.6892418721489178E+05, 1.0318195403424004E+05, -1.1375573205951886E+05, -1.4308966168492358E+04}; - FLT c6[] = {4.0848961919700960E+03, 7.5033277163528910E+03, -5.2578904182711594E+04, 6.3431596329919275E+03, 1.5984798504282799E+05, -1.2521363434070408E+05, -1.2521363434057294E+05, 1.5984798504289921E+05, 6.3431596327853522E+03, -5.2578904182714803E+04, 7.5033277163530738E+03, 4.0848961919843541E+03}; - FLT c7[] = {7.1658797373677544E+02, -1.5499947984100402E+03, -4.5490740453241297E+03, 1.4520122796414065E+04, -3.7896465826366048E+03, -2.3597107892645658E+04, 2.3597107892708405E+04, 3.7896465828577311E+03, -1.4520122796272850E+04, 4.5490740453326107E+03, 1.5499947984094520E+03, -7.1658797372277388E+02}; - FLT c8[] = {5.2022749592533359E+01, -4.0624258132650436E+02, 5.2256582980122801E+02, 9.3282469962834807E+02, -2.8710622267611107E+03, 1.7594166903207245E+03, 1.7594166904840572E+03, -2.8710622269566602E+03, 9.3282469973848731E+02, 5.2256582976889342E+02, -4.0624258132718376E+02, 5.2022749606062760E+01}; - FLT c9[] = {-7.0341875498860729E+00, -2.3043166229077922E+01, 1.2279331781679724E+02, -1.6714687548507158E+02, -4.4746498424591195E+01, 3.6060906024962412E+02, -3.6060905985137049E+02, 4.4746498852565225E+01, 1.6714687549695972E+02, -1.2279331779599295E+02, 2.3043166228938606E+01, 7.0341875614861786E+00}; - FLT c10[] = {-2.1556100132617875E+00, 4.1361104009993737E+00, 1.8107701723532290E+00, -2.1223400322208619E+01, 3.5820961861882218E+01, -1.8782945665578143E+01, -1.8782945409136026E+01, 3.5820961915195049E+01, -2.1223400242576908E+01, 1.8107701298380314E+00, 4.1361104007462801E+00, -2.1556100021452793E+00}; - FLT c11[] = {-1.1440899376747954E-01, 7.0567641591060326E-01, -1.4530217904770133E+00, 1.0571984613482723E+00, 1.4389002957406878E+00, -4.2241732762744180E+00, 4.2241733421252539E+00, -1.4389000664821670E+00, -1.0571984509828731E+00, 1.4530218285851431E+00, -7.0567641613924970E-01, 1.1440900438178304E-01}; - FLT c12[] = {-1.4486009663463860E-02, 2.9387825785034223E-03, -1.0265969715607470E-01, 2.6748267835596640E-01, -3.3606430399849180E-01, 1.5850148085005597E-01, 1.5850183161365292E-01, -3.3606448814949358E-01, 2.6748281866164947E-01, -1.0265975004478733E-01, 2.9387817050372631E-03, -1.4486000369842612E-02}; - for (int i=0; i<12; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i])))))))))))); - } else if (w==13) { - FLT c0[] = {9.3397060605267689E+03, 3.9447202186643109E+06, 1.3701428307175812E+08, 1.4375660883001409E+09, 6.6384519128895693E+09, 1.5848048271166529E+10, 2.1031560281976665E+10, 1.5848048271166502E+10, 6.6384519128895674E+09, 1.4375660883001378E+09, 1.3701428307175812E+08, 3.9447202186642843E+06, 9.3397060605268125E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {4.0984512931817764E+04, 8.6828943763566799E+06, 1.9558432133067656E+08, 1.3674961320373521E+09, 3.9251291128182430E+09, 4.5116631434426517E+09, 4.8375356630808043E-07, -4.5116631434426460E+09, -3.9251291128182402E+09, -1.3674961320373492E+09, -1.9558432133067656E+08, -8.6828943763566278E+06, -4.0984512931817771E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {7.8379538318778985E+04, 8.4928073133582603E+06, 1.1992091153966437E+08, 5.0561697705436689E+08, 6.1845897311593950E+08, -5.1306326495404470E+08, -1.4790096327029374E+09, -5.1306326495404077E+08, 6.1845897311593986E+08, 5.0561697705436659E+08, 1.1992091153966436E+08, 8.4928073133582156E+06, 7.8379538318778927E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {8.6417670227040013E+04, 4.8250267333349697E+06, 3.9836803808039002E+07, 7.5026052902191013E+07, -7.7565422849560052E+07, -2.5393835488011825E+08, 5.1202971235247489E-07, 2.5393835488012013E+08, 7.7565422849558711E+07, -7.5026052902191967E+07, -3.9836803808039002E+07, -4.8250267333349511E+06, -8.6417670227039998E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {6.1161604972829380E+04, 1.7331203720075535E+06, 7.0216196997558968E+06, -3.6027138646117523E+06, -3.1775875626364492E+07, 1.6544480876790185E+06, 4.9816566960114852E+07, 1.6544480876808946E+06, -3.1775875626363728E+07, -3.6027138646113039E+06, 7.0216196997558847E+06, 1.7331203720075490E+06, 6.1161604972829351E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {2.9177164557155938E+04, 3.9318079134661221E+05, 3.1307448297760956E+05, -2.7571366584957433E+06, -9.8421840747392306E+05, 6.8469173866731795E+06, 2.9232946975263515E-06, -6.8469173866698397E+06, 9.8421840747792379E+05, 2.7571366584955421E+06, -3.1307448297758284E+05, -3.9318079134660971E+05, -2.9177164557155946E+04, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {9.5097815505886610E+03, 4.8799940773716655E+04, -1.2734023162441862E+05, -2.5472337176564379E+05, 6.3596049196278059E+05, 2.2361868201841635E+05, -1.0716559939651759E+06, 2.2361868202218774E+05, 6.3596049196161982E+05, -2.5472337176485342E+05, -1.2734023162441724E+05, 4.8799940773713337E+04, 9.5097815505886447E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {2.0601715730545379E+03, 1.9365931141472569E+02, -2.5304303117518622E+04, 2.9151392447034210E+04, 5.9055020355306144E+04, -1.1784846181665688E+05, 1.1400011168699383E-06, 1.1784846181507374E+05, -5.9055020356297522E+04, -2.9151392447480976E+04, 2.5304303117520958E+04, -1.9365931141621550E+02, -2.0601715730545466E+03, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {2.5975061893404052E+02, -1.0025387650583972E+03, -6.8642481194759603E+02, 6.7515314205452096E+03, -7.0772939650079616E+03, -6.5444514139847633E+03, 1.6566898963381227E+04, -6.5444514164662887E+03, -7.0772939638053231E+03, 6.7515314202341915E+03, -6.8642481198706810E+02, -1.0025387650556635E+03, 2.5975061893403893E+02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {5.8705282128634133E+00, -1.4424362302822419E+02, 3.3390627215295177E+02, 4.8151337640374301E+01, -1.1431733953039347E+03, 1.4557114789663567E+03, 1.9301282133401762E-06, -1.4557114797747520E+03, 1.1431733969207255E+03, -4.8151337212400264E+01, -3.3390627213809154E+02, 1.4424362302302313E+02, -5.8705282128808269E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {-4.0954969508936898E+00, -1.2634947188543673E+00, 3.8134139835466350E+01, -8.4115524781317148E+01, 4.2766848228448069E+01, 1.0573434411021174E+02, -1.9636661067694894E+02, 1.0573435394677749E+02, 4.2766846813968300E+01, -8.4115525213218916E+01, 3.8134139824669184E+01, -1.2634947158177201E+00, -4.0954969509055461E+00, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c11[] = {-6.2702735486285888E-01, 1.8595467772479546E+00, -1.3027978470952948E+00, -4.9265265903267785E+00, 1.3906831953385087E+01, -1.3753762586104637E+01, 1.0604155239584518E-06, 1.3753756761963198E+01, -1.3906831509501583E+01, 4.9265273268806409E+00, 1.3027978586801867E+00, -1.8595467797630916E+00, 6.2702735486047489E-01, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c12[] = {-4.8290636703364975E-02, 1.7531876505199090E-01, -5.0041292774701596E-01, 6.3665145473474949E-01, -1.2476811514471326E-02, -1.2061603189510861E+00, 1.8595308638696268E+00, -1.2061633355215959E+00, -1.2475969680262359E-02, 6.3665088474340670E-01, -5.0041295405456876E-01, 1.7531876799797264E-01, -4.8290636708721864E-02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c13[] = {2.2894665617766322E-02, -7.1358257229878720E-03, -1.4950743217821900E-02, 7.0611745711086651E-02, -1.2311302279978055E-01, 1.0342573392772816E-01, 5.7346192890547669E-07, -1.0342709034448951E-01, 1.2311300937219723E-01, -7.0611830251417942E-02, 1.4950741891648016E-02, 7.1358203725587141E-03, -2.2894665628191136E-02, 0.0000000000000000E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i]))))))))))))); - } else if (w==14) { - FLT c0[] = {1.3368785683552904E+04, 7.5304732752870144E+06, 3.2765764524434990E+08, 4.2418096936485257E+09, 2.4197690538177525E+10, 7.2227640697189651E+10, 1.2261475327356714E+11, 1.2261475327356711E+11, 7.2227640697189682E+10, 2.4197690538177582E+10, 4.2418096936485257E+09, 3.2765764524435169E+08, 7.5304732752870200E+06, 1.3368785683578039E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c1[] = {6.1154444023081669E+04, 1.7488686085101541E+07, 5.0279014009863263E+08, 4.4777867842655849E+09, 1.6916819861812059E+10, 2.8971884004562843E+10, 1.6054555293734524E+10, -1.6054555293734529E+10, -2.8971884004562843E+10, -1.6916819861812090E+10, -4.4777867842655830E+09, -5.0279014009863406E+08, -1.7488686085101560E+07, -6.1154444023056145E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c2[] = {1.2279790808348049E+05, 1.8230319600271538E+07, 3.3815815633683985E+08, 1.9369899011251254E+09, 3.9743454154781203E+09, 7.4954544638351786E+08, -7.0173920607395000E+09, -7.0173920607395000E+09, 7.4954544638351130E+08, 3.9743454154781117E+09, 1.9369899011251252E+09, 3.3815815633684093E+08, 1.8230319600271557E+07, 1.2279790808350699E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c3[] = {1.4339321200624766E+05, 1.1200899688172188E+07, 1.2799140125169712E+08, 4.0176966726270604E+08, 7.9146174555810899E+07, -1.1719748245183561E+09, -9.6919138198233843E+08, 9.6919138198235476E+08, 1.1719748245183618E+09, -7.9146174555819452E+07, -4.0176966726270568E+08, -1.2799140125169776E+08, -1.1200899688172201E+07, -1.4339321200622554E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c4[] = {1.0866548538632700E+05, 4.4565213401510641E+06, 2.8354150929531462E+07, 2.2805067924009934E+07, -1.2058223609889300E+08, -1.2775415620368913E+08, 1.9261201640091014E+08, 1.9261201640090343E+08, -1.2775415620368628E+08, -1.2058223609888241E+08, 2.2805067924009915E+07, 2.8354150929531943E+07, 4.4565213401510660E+06, 1.0866548538635390E+05, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c5[] = {5.6346565047794407E+04, 1.1743908345502375E+06, 3.0601086667309003E+06, -7.2274020134796975E+06, -1.6220595157143334E+07, 2.0773587344466623E+07, 2.8183198298701070E+07, -2.8183198298682313E+07, -2.0773587344454899E+07, 1.6220595157147046E+07, 7.2274020134809064E+06, -3.0601086667310768E+06, -1.1743908345502312E+06, -5.6346565047771022E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c6[] = {2.0435142564639598E+04, 1.9450977300078847E+05, -1.1234667576926883E+05, -1.5205767549240857E+06, 1.0515640561047094E+06, 3.7458351782500809E+06, -3.3794074240119159E+06, -3.3794074240111569E+06, 3.7458351782506104E+06, 1.0515640561079446E+06, -1.5205767549239916E+06, -1.1234667576914738E+05, 1.9450977300078212E+05, 2.0435142564663307E+04, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c7[] = {5.1491366053560478E+03, 1.4735748500440239E+04, -8.1689482343683034E+04, -3.5176894225644079E+04, 3.7034248410400847E+05, -1.9109669530460562E+05, -5.2637978465735121E+05, 5.2637978465564619E+05, 1.9109669530912716E+05, -3.7034248412078863E+05, 3.5176894225852200E+04, 8.1689482343699274E+04, -1.4735748500439855E+04, -5.1491366053330485E+03, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c8[] = {8.5138795113645585E+02, -1.2978618911733427E+03, -8.7500873646623440E+03, 2.1319159613970569E+04, 7.6586611605801199E+03, -6.2424139811455236E+04, 4.2620771487921840E+04, 4.2620771491440872E+04, -6.2424139815176597E+04, 7.6586611693937375E+03, 2.1319159613447209E+04, -8.7500873648877496E+03, -1.2978618911701635E+03, 8.5138795115875257E+02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c9[] = {7.2176142041616245E+01, -4.5543406155008586E+02, 2.8301959891624585E+02, 2.1994171513769957E+03, -4.5082500677203352E+03, 4.7658016853354945E+02, 7.1044827209848581E+03, -7.1044827023442112E+03, -4.7658015978385805E+02, 4.5082500694322307E+03, -2.1994171506161529E+03, -2.8301959873197922E+02, 4.5543406154525627E+02, -7.2176142022451799E+01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c10[] = {-3.1135380163286266E+00, -3.8554406982628045E+01, 1.4396028111579378E+02, -1.1260050352192819E+02, -3.0073665460436297E+02, 7.2079162225452933E+02, -4.1195308319958349E+02, -4.1195308907344031E+02, 7.2079162228692246E+02, -3.0073665296314113E+02, -1.1260050391063737E+02, 1.4396028095922969E+02, -3.8554406981953719E+01, -3.1135379980309104E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c11[] = {-1.6022934776950781E+00, 1.8678197421257499E+00, 8.3368944138930576E+00, -3.0791578217513287E+01, 3.4749712345962102E+01, 1.2322522680262193E+01, -7.3924006859338746E+01, 7.3924005395986399E+01, -1.2322518095091780E+01, -3.4749717239655702E+01, 3.0791578812609753E+01, -8.3368942651188451E+00, -1.8678197375527952E+00, 1.6022934952009980E+00, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c12[] = {-1.9362061840948824E-01, 6.3024467669748396E-01, -9.3262278519229969E-01, -4.8908749318740480E-01, 4.0479376609320967E+00, -6.2829712900962678E+00, 3.1767825933699174E+00, 3.1767865219197975E+00, -6.2829777441520323E+00, 4.0479394849078085E+00, -4.8908801933495105E-01, -9.3262306580362497E-01, 6.3024467258732675E-01, -1.9362060312142931E-01, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c13[] = {1.8785913718903639E-02, 3.1605271252714680E-02, -1.3655798291459853E-01, 2.5016547139148904E-01, -1.6654308552073466E-01, -2.1682598043284024E-01, 6.1786085249849709E-01, -6.1785470804340159E-01, 2.1682794765059335E-01, 1.6654258378326353E-01, -2.5016523395036322E-01, 1.3655803190024704E-01, -3.1605272440421092E-02, -1.8785905282938619E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - FLT c14[] = {-1.2896545140952162E-02, -3.7106972352948116E-03, 5.8857860695711909E-04, 1.3987176343065890E-02, -3.5714007561179102E-02, 4.3401590960273219E-02, -2.0034532372716081E-02, -2.0038454375630149E-02, 4.3401322628411031E-02, -3.5713348533616053E-02, 1.3987046090052241E-02, 5.8856319054218355E-04, -3.7106979912720915E-03, -1.2896537385752806E-02, 0.0000000000000000E+00, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i])))))))))))))); - } else if (w==15) { - FLT c0[] = {1.8887777774374499E+04, 1.4015330434461417E+07, 7.5498683300180018E+08, 1.1900937739619951E+10, 8.2530965279375351E+10, 3.0178246269069604E+11, 6.3775691457119104E+11, 8.1471473119305554E+11, 6.3775691457119116E+11, 3.0178246269069641E+11, 8.2530965279375519E+10, 1.1900937739619963E+10, 7.5498683300180054E+08, 1.4015330434461435E+07, 1.8887777774374488E+04, 0.0000000000000000E+00}; - FLT c1[] = {8.9780907163796335E+04, 3.4167636285297148E+07, 1.2346880033823481E+09, 1.3719272724135921E+10, 6.5858241494816696E+10, 1.5266999939989539E+11, 1.5687794513790723E+11, -2.8523584844088883E-05, -1.5687794513790732E+11, -1.5266999939989545E+11, -6.5858241494816811E+10, -1.3719272724135933E+10, -1.2346880033823476E+09, -3.4167636285297163E+07, -8.9780907163796335E+04, 0.0000000000000000E+00}; - FLT c2[] = {1.8850321233130712E+05, 3.7693640983013541E+07, 8.9846818051570034E+08, 6.7094088040439653E+09, 1.9743296615199215E+10, 1.8072727219391140E+10, -2.0634615374559410E+10, -4.9654335197177498E+10, -2.0634615374559414E+10, 1.8072727219391048E+10, 1.9743296615199223E+10, 6.7094088040439672E+09, 8.9846818051570022E+08, 3.7693640983013526E+07, 1.8850321233130703E+05, 0.0000000000000000E+00}; - FLT c3[] = {2.3185006533495727E+05, 2.4789475362741601E+07, 3.7751696829092383E+08, 1.7167916788178182E+09, 1.9832401267745295E+09, -3.4881359830884194E+09, -7.8785602379628601E+09, 6.6906528952995499E-05, 7.8785602379629536E+09, 3.4881359830884261E+09, -1.9832401267745163E+09, -1.7167916788178096E+09, -3.7751696829092425E+08, -2.4789475362741597E+07, -2.3185006533495730E+05, 0.0000000000000000E+00}; - FLT c4[] = {1.8672970114818285E+05, 1.0741068109706732E+07, 9.8017949708492473E+07, 2.0291084954252145E+08, -2.7857869294214898E+08, -9.4112677968756318E+08, 1.7886520649334356E+08, 1.4579673547891481E+09, 1.7886520649344125E+08, -9.4112677968753338E+08, -2.7857869294217581E+08, 2.0291084954251301E+08, 9.8017949708492488E+07, 1.0741068109706739E+07, 1.8672970114818282E+05, 0.0000000000000000E+00}; - FLT c5[] = {1.0411891611891470E+05, 3.1771463075269456E+06, 1.4880104152842037E+07, -6.8136965447538150E+06, -8.7072998215422541E+07, 1.8024116530863210E+06, 1.9067730799615666E+08, 1.2078175959365315E-04, -1.9067730799603686E+08, -1.8024116529155241E+06, 8.7072998215445980E+07, 6.8136965447565373E+06, -1.4880104152841812E+07, -3.1771463075269484E+06, -1.0411891611891470E+05, 0.0000000000000000E+00}; - FLT c6[] = {4.1300641422694731E+04, 6.3217168592497683E+05, 7.7343707634845132E+05, -5.4575962381476769E+06, -3.7387211063063843E+06, 1.8451583614082869E+07, 3.0480804948189310E+06, -2.7500445095872246E+07, 3.0480804948457484E+06, 1.8451583614064269E+07, -3.7387211062890980E+06, -5.4575962381450543E+06, 7.7343707634841127E+05, 6.3217168592497602E+05, 4.1300641422694724E+04, 0.0000000000000000E+00}; - FLT c7[] = {1.1710443348523711E+04, 7.5405449195716908E+04, -1.6634736996487752E+05, -5.6069290801842115E+05, 1.1540571563940533E+06, 1.0209821660925965E+06, -2.9641921942009293E+06, -7.3770236318814628E-06, 2.9641921942630685E+06, -1.0209821662946860E+06, -1.1540571563987043E+06, 5.6069290801928868E+05, 1.6634736996459437E+05, -7.5405449195719295E+04, -1.1710443348523739E+04, 0.0000000000000000E+00}; - FLT c8[] = {2.3142324239350210E+03, 2.1710560541703007E+03, -3.6929625713151705E+04, 2.6143898219588682E+04, 1.4046980090353978E+05, -2.1033190114896413E+05, -1.1132269819276403E+05, 3.7491447373940505E+05, -1.1132269820720138E+05, -2.1033190120894444E+05, 1.4046980085134835E+05, 2.6143898217223435E+04, -3.6929625713258414E+04, 2.1710560541651053E+03, 2.3142324239349791E+03, 0.0000000000000000E+00}; - FLT c9[] = {2.8879718294281940E+02, -9.2801372612866078E+02, -1.9817144428357562E+03, 9.9004179214302640E+03, -5.7928268996319048E+03, -2.1083466266548403E+04, 3.3285502001854453E+04, 1.3615676123196788E-04, -3.3285501884684672E+04, 2.1083466388283239E+04, 5.7928269528908959E+03, -9.9004179214302640E+03, 1.9817144428357562E+03, 9.2801372612624596E+02, -2.8879718294281940E+02, 0.0000000000000000E+00}; - FLT c10[] = {1.3121871131759899E+01, -1.5978845118014243E+02, 2.7429718889479011E+02, 4.4598059431432415E+02, -1.8917609556521720E+03, 1.5303002256342920E+03, 1.7542368404254241E+03, -3.9411530187890685E+03, 1.7542368839611659E+03, 1.5303002335812619E+03, -1.8917609760379448E+03, 4.4598059250034765E+02, 2.7429718872202716E+02, -1.5978845118149314E+02, 1.3121871131760223E+01, 0.0000000000000000E+00}; - FLT c11[] = {-2.4286151057622600E+00, -6.7839829150137421E+00, 4.6999223003107119E+01, -7.4896070454665107E+01, -3.2010110856873055E+01, 2.5022929107925501E+02, -2.8786053481345135E+02, 1.4424367379967129E-05, 2.8786057555317575E+02, -2.5022937123192844E+02, 3.2010139421505684E+01, 7.4896073537460509E+01, -4.6999223012862650E+01, 6.7839829186720362E+00, 2.4286151057336860E+00, 0.0000000000000000E+00}; - FLT c12[] = {-5.4810555665671257E-01, 1.1436870859674571E+00, 8.2471504792547190E-01, -8.5602131787584241E+00, 1.5631631237511966E+01, -6.4979395997142886E+00, -1.8737629118679905E+01, 3.3283673647767003E+01, -1.8737705444926284E+01, -6.4980552114725620E+00, 1.5631576798962341E+01, -8.5602158445716778E+00, 8.2471481116140977E-01, 1.1436870769250529E+00, -5.4810555667406624E-01, 0.0000000000000000E+00}; - FLT c13[] = {-1.4554612891837512E-02, 1.7022157398269799E-01, -3.7563892964814216E-01, 2.0131145240492249E-01, 8.3554123561642435E-01, -2.1191317631421946E+00, 1.9961007770939201E+00, 5.0230495487029605E-05, -1.9960655197919825E+00, 2.1191435815870405E+00, -8.3552330614378623E-01, -2.0131363341395125E-01, 3.7563890238546094E-01, -1.7022157734534860E-01, 1.4554612875194470E-02, 0.0000000000000000E+00}; - FLT c14[] = {-1.2348455978815665E-02, 2.6143485494326945E-03, -2.9252290291144727E-02, 7.5392101552106419E-02, -8.7986538697867239E-02, 1.3073120666751545E-03, 1.5251801232957554E-01, -2.3235618419546245E-01, 1.5253703942622115E-01, 1.3217162898956957E-03, -8.7999818995735196E-02, 7.5391507930594778E-02, -2.9252395603998178E-02, 2.6143483927929994E-03, -1.2348455970768767E-02, 0.0000000000000000E+00}; - FLT c15[] = {1.4214685591273772E-02, -1.2364346992375923E-03, 1.2892328724708124E-03, 1.6178725688327468E-03, -8.2104229475896996E-03, 1.3914679473447157E-02, -1.1426959041713501E-02, 1.6590583007947697E-05, 1.1446333966460217E-02, -1.3912124902889801E-02, 8.2298310485774198E-03, -1.6155336438419190E-03, -1.2892162843503102E-03, 1.2364372911314208E-03, -1.4214685607473108E-02, 0.0000000000000000E+00}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i]))))))))))))))); - } else if (w==16) { - FLT c0[] = {2.6374086784014689E+04, 2.5501413681212645E+07, 1.6835469840840099E+09, 3.1953580806547867E+10, 2.6584910126662766E+11, 1.1715858191494619E+12, 3.0181658330343120E+12, 4.7888775408612773E+12, 4.7888775408612764E+12, 3.0181658330343125E+12, 1.1715858191494619E+12, 2.6584910126662772E+11, 3.1953580806547874E+10, 1.6835469840840104E+09, 2.5501413681212656E+07, 2.6374086784014886E+04}; - FLT c1[] = {1.2991568388123445E+05, 6.4986154651133664E+07, 2.9142305012947259E+09, 3.9748054433728149E+10, 2.3649443248440247E+11, 7.0471088240421252E+11, 1.0533888905987031E+12, 5.4832304482297632E+11, -5.4832304482297687E+11, -1.0533888905987034E+12, -7.0471088240421265E+11, -2.3649443248440250E+11, -3.9748054433728149E+10, -2.9142305012947259E+09, -6.4986154651133649E+07, -1.2991568388123448E+05}; - FLT c2[] = {2.8421223836872831E+05, 7.5448503558118582E+07, 2.2710828032883868E+09, 2.1491603403163826E+10, 8.4299374042308136E+10, 1.3384457365769528E+11, 1.8630012765531485E+09, -2.4384536789321179E+11, -2.4384536789321094E+11, 1.8630012765532806E+09, 1.3384457365769531E+11, 8.4299374042308090E+10, 2.1491603403163826E+10, 2.2710828032883863E+09, 7.5448503558118552E+07, 2.8421223836872820E+05}; - FLT c3[] = {3.6653021243297518E+05, 5.2693428548387080E+07, 1.0410094433021281E+09, 6.3986267576853533E+09, 1.3313926739756302E+10, -2.7909761561128025E+09, -3.9911638977027977E+10, -2.9236947704012939E+10, 2.9236947704012939E+10, 3.9911638977028267E+10, 2.7909761561128430E+09, -1.3313926739756279E+10, -6.3986267576853561E+09, -1.0410094433021276E+09, -5.2693428548387088E+07, -3.6653021243297518E+05}; - FLT c4[] = {3.1185660915838118E+05, 2.4564274645530280E+07, 3.0509279143241835E+08, 1.0432225146182569E+09, 6.4966284440222360E+07, -4.2483903608016477E+09, -3.1778261722524829E+09, 5.9880587942832708E+09, 5.9880587942832832E+09, -3.1778261722526174E+09, -4.2483903608017979E+09, 6.4966284440235756E+07, 1.0432225146182607E+09, 3.0509279143241805E+08, 2.4564274645530272E+07, 3.1185660915838124E+05}; - FLT c5[] = {1.8544733523229562E+05, 7.9824949938292839E+06, 5.6880943382648192E+07, 5.4097201999258779E+07, -3.0776449202833223E+08, -3.7659931821867347E+08, 6.8797698944719648E+08, 7.5429896889866996E+08, -7.5429896889781320E+08, -6.8797698944658160E+08, 3.7659931821898031E+08, 3.0776449202837497E+08, -5.4097201999252096E+07, -5.6880943382647842E+07, -7.9824949938292857E+06, -1.8544733523229562E+05}; - FLT c6[] = {7.9472339236673259E+04, 1.8159676553648398E+06, 5.7259818806751696E+06, -1.2786136236423338E+07, -3.8677490873147681E+07, 4.7651450515707508E+07, 9.0723760109202415E+07, -9.4532949239946112E+07, -9.4532949239604995E+07, 9.0723760109522834E+07, 4.7651450515667401E+07, -3.8677490873160362E+07, -1.2786136236416934E+07, 5.7259818806752721E+06, 1.8159676553648538E+06, 7.9472339236673215E+04}; - FLT c7[] = {2.4831718998299857E+04, 2.7536301841716090E+05, -5.1045953356025166E+04, -2.6996387880239477E+06, 1.1656554632125401E+06, 9.1521923449522462E+06, -6.8198180925621921E+06, -1.2555197000954127E+07, 1.2555197001087580E+07, 6.8198180925775450E+06, -9.1521923449367471E+06, -1.1656554632051867E+06, 2.6996387880183556E+06, 5.1045953355832869E+04, -2.7536301841717580E+05, -2.4831718998299897E+04}; - FLT c8[] = {5.6060763597396035E+03, 2.2154740880101843E+04, -1.0243462874810334E+05, -1.1802198892388590E+05, 6.4061699367506150E+05, -1.1166716749369531E+05, -1.4153578101923370E+06, 1.0790712965214122E+06, 1.0790712965802078E+06, -1.4153578102569627E+06, -1.1166716767280686E+05, 6.4061699367841065E+05, -1.1802198892652121E+05, -1.0243462874831920E+05, 2.2154740880096295E+04, 5.6060763597396262E+03}; - FLT c9[] = {8.7271993222049730E+02, -7.0074676859193858E+02, -1.2528372958474913E+04, 2.3643101054370443E+04, 3.1699060146436736E+04, -1.1270133578294520E+05, 3.6872846840416030E+04, 1.5168911768972370E+05, -1.5168911672801850E+05, -3.6872846329129716E+04, 1.1270133600206790E+05, -3.1699060140349993E+04, -2.3643101053229180E+04, 1.2528372958403583E+04, 7.0074676858840917E+02, -8.7271993222049730E+02}; - FLT c10[] = {7.8842259458727298E+01, -4.2070880913717718E+02, -1.0535142166729695E+02, 3.3375056757602101E+03, -4.9426353709826744E+03, -3.6567309465694352E+03, 1.5199085032737788E+04, -9.4972226150681072E+03, -9.4972224492176338E+03, 1.5199085307902486E+04, -3.6567309714471071E+03, -4.9426353751288962E+03, 3.3375056795609726E+03, -1.0535142205602271E+02, -4.2070880913447866E+02, 7.8842259458701932E+01}; - FLT c11[] = {8.9833076760252317E-02, -4.4163371177310189E+01, 1.2880771175011134E+02, 2.8722208980881483E+00, -5.7164632401064989E+02, 9.0417621054583299E+02, 1.1221311957018894E+00, -1.4190922684153286E+03, 1.4190926436578332E+03, -1.1219382673482139E+00, -9.0417616902565715E+02, 5.7164633587355513E+02, -2.8722219907225899E+00, -1.2880771149646372E+02, 4.4163371174871045E+01, -8.9833076793553943E-02}; - FLT c12[] = {-1.0900468357304585E+00, -1.1264666580175993E-01, 1.1810668498718398E+01, -3.0289105594116332E+01, 1.5494599855921946E+01, 6.0130016326899806E+01, -1.2330195579557967E+02, 6.7114292010484860E+01, 6.7114238133033894E+01, -1.2330200967294053E+02, 6.0129899592769000E+01, 1.5494588631452897E+01, -3.0289108821162568E+01, 1.1810668060273379E+01, -1.1264668224327026E-01, -1.0900468357482698E+00}; - FLT c13[] = {-1.1763610124684608E-01, 4.2939195551308978E-01, -2.7950231695310290E-01, -1.7354597875532083E+00, 5.1181749794184972E+00, -5.0538409872852545E+00, -2.1268758321444312E+00, 1.0709572497394593E+01, -1.0709247944735344E+01, 2.1270284132327628E+00, 5.0538814533614023E+00, -5.1181783143082038E+00, 1.7354587260576941E+00, 2.7950208340719496E-01, -4.2939195720020440E-01, 1.1763610121354666E-01}; - FLT c14[] = {-1.8020499708490779E-02, 3.6694576081450124E-02, -1.1331174689418615E-01, 1.3970801507325420E-01, 8.1708800731612838E-02, -5.4465632012605969E-01, 7.9628723318194716E-01, -3.9045387765910361E-01, -3.9034731591396871E-01, 7.9641679205120786E-01, -5.4465236519348836E-01, 8.1709687544577886E-02, 1.3970913694934384E-01, -1.1331198385459386E-01, 3.6694575058947500E-02, -1.8020499699434717E-02}; - FLT c15[] = {1.4589783457723899E-02, -7.8885273589694921E-04, -4.4854775481901451E-03, 1.8117810622567232E-02, -3.0563678378015532E-02, 1.9027105036022670E-02, 2.4778670881552757E-02, -6.7767913155521747E-02, 6.7979444868167399E-02, -2.4638534439549119E-02, -1.8992900331546877E-02, 3.0569915511324409E-02, -1.8117279802711158E-02, 4.4857097818771776E-03, 7.8885377265448060E-04, -1.4589783469873403E-02}; - FLT c16[] = {-1.0467998068898355E-02, -3.2140568385029999E-04, 5.2979866592800886E-04, -1.5800624712947203E-04, -1.4200041949817279E-03, 3.7626007108648857E-03, -3.8348321381240775E-03, 1.6547563335740942E-03, 1.5759584129276946E-03, -3.8873640852216617E-03, 3.7166352571544989E-03, -1.4265706883689335E-03, -1.5923746463956793E-04, 5.2952292450647511E-04, -3.2141610431099765E-04, -1.0467998084554094E-02}; - for (int i=0; i<16; i++) ker[i] = c0[i] + z*(c1[i] + z*(c2[i] + z*(c3[i] + z*(c4[i] + z*(c5[i] + z*(c6[i] + z*(c7[i] + z*(c8[i] + z*(c9[i] + z*(c10[i] + z*(c11[i] + z*(c12[i] + z*(c13[i] + z*(c14[i] + z*(c15[i] + z*(c16[i])))))))))))))))); - } else - printf("width not implemented!\n"); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index e568c920e..c4df21207 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -26,17 +26,17 @@ struct zip_low; struct zip_hi; // forward declaration to clean up the code and be able to use this everywhere in the file template static constexpr auto BestSIMDHelper(); -template constexpr auto GetPaddedSIMDSize(); +template constexpr auto GetPaddedSIMDWidth(); template -using PaddedSIMD = typename xsimd::make_sized_batch()>::type; +using PaddedSIMD = typename xsimd::make_sized_batch()>::type; template uint8_t get_padding(uint8_t ns); template constexpr auto get_padding(); template using BestSIMD = typename decltype(BestSIMDHelper::size>())::type; -template constexpr uint8_t min_batch_size(); -template constexpr auto find_optimal_batch_size(); +template constexpr uint8_t min_simd_width(); +template constexpr auto find_optimal_simd_width(); template -constexpr auto initialize_complex_batch(V a, V b) noexcept; +constexpr auto initialize_complex_register(V a, V b) noexcept; template constexpr auto zip_low_index = xsimd::make_batch_constant, arch_t, zip_low>(); @@ -53,7 +53,7 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset } // namespace // declarations of purely internal functions... (thus need not be in .h) template()>, + class simd_type = xsimd::make_sized_batch_t()>, typename... V> static auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept; @@ -62,8 +62,8 @@ static FINUFFT_ALWAYS_INLINE void set_kernel_args( FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; static FINUFFT_ALWAYS_INLINE void evaluate_kernel_vector( FLT *ker, FLT *args, const finufft_spread_opts &opts, int N) noexcept; -template()>> // aka ns +template()>> // aka ns static FINUFFT_ALWAYS_INLINE void eval_kernel_vec_Horner( FLT *FINUFFT_RESTRICT ker, FLT x, const finufft_spread_opts &opts) noexcept; template @@ -377,9 +377,9 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat { // local copies of NU pts and data for each subproblem std::vector kx0{0}, ky0{0}, kz0{0}, dd0{0}, du0{0}; -#pragma omp for schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem +#pragma omp for schedule(dynamic, 1) // each is big + for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems + const BIGINT M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem // copy the location and data vectors for the nonuniform points kx0.resize(M0); ky0.resize(M0 * (N2 > 1)); @@ -447,11 +447,11 @@ int interpSorted_kernel(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { - using batch_t = xsimd::batch; - using arch_t = typename batch_t::arch_type; + using simd_type = xsimd::batch; + using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto alignment = batch_t::arch_type::alignment(); - static constexpr auto avx_size = batch_t::size; + static constexpr auto alignment = simd_type::arch_type::alignment(); + static constexpr auto simd_size = simd_type::size; static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift CNTime timer; @@ -739,7 +739,7 @@ void evaluate_kernel_vector(FLT *ker, FLT *args, const finufft_spread_opts &opts if (abs(args[i]) >= (FLT)opts.ES_halfwidth) ker[i] = 0.0; } -template // aka ns +template // aka ns void eval_kernel_vec_Horner(FLT *FINUFFT_RESTRICT ker, const FLT x, const finufft_spread_opts &opts) noexcept /* Fill ker[] with Horner piecewise poly approx to [-w/2,w/2] ES kernel eval at @@ -751,21 +751,21 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ const FLT z = std::fma(FLT(2.0), x, FLT(w - 1)); // scale so local grid offset z in // [-1,1] if (opts.upsampfac == 2.0) { // floating point equality is fine here - static constexpr auto alignment = batch_t::arch_type::alignment(); - static constexpr auto avx_size = batch_t::size; - static constexpr auto padded_ns = (w + avx_size - 1) & ~(avx_size - 1); + static constexpr auto alignment = simd_type::arch_type::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto padded_ns = (w + simd_size - 1) & ~(simd_size - 1); static constexpr auto nc = nc200(); static constexpr auto horner_coeffs = get_horner_coeffs_200(); alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - const auto zv = batch_t(z); + const auto zv = simd_type(z); - for (uint8_t i = 0; i < w; i += avx_size) { - auto k = batch_t::load_aligned(padded_coeffs[0].data() + i); + for (uint8_t i = 0; i < w; i += simd_size) { + auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); for (uint8_t j = 1; j < nc; ++j) { - const auto cji = batch_t::load_aligned(padded_coeffs[j].data() + i); + const auto cji = simd_type::load_aligned(padded_coeffs[j].data() + i); k = xsimd::fma(k, zv, cji); } k.store_aligned(ker + i); @@ -1028,11 +1028,11 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( This needed off1 as extra arg. AHB 11/30/20. Vectorized using xsimd by M. Barbone 06/24. */ - using batch_t = PaddedSIMD; - using arch_t = typename batch_t::arch_type; + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto alignment = batch_t::arch_type::alignment(); - static constexpr auto avx_size = batch_t::size; + static constexpr auto alignment = arch_t::alignment(); + static constexpr auto simd_size = simd_type::size; static constexpr auto ns2 = ns * FLT(0.5); // half spread width // something weird here. Reversing ker{0} and std fill causes ker // to be zeroed inside the loop GCC uses AVX, clang AVX2 @@ -1048,59 +1048,69 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // but is a hint to the compiler that after the lambda // dd_pt is not modified and can be kept as is in a register // given (re, im) in this case dd[i*2] and dd[i*2+1] - // this function returns a simd register of size avx_size + // this function returns a simd register of size simd_size // initialized as follows: // +-----------------------+ // |re|im|re|im|re|im|re|im| // +-----------------------+ - const auto dd_pt = initialize_complex_batch(dd[i * 2], dd[i * 2 + 1]); + const auto dd_pt = initialize_complex_register(dd[i * 2], dd[i * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = BIGINT(std::ceil(kx[i] - ns2)); // fine grid start index // FLT(i1) has different semantics and results an extra cast - auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding - // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly - // kernel evaluation will fall outside their designed domains, >>1 errors. - // This can only happen if the overall error would be O(1) anyway. Clip x1?? - if (x1 < -ns2) x1 = -ns2; - if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** - ker_eval(ker.data(), opts, x1); - // const auto ker = ker_eval(opts, x1); + const auto x1 = [i, kx]() constexpr noexcept { + auto x1 = std::ceil(kx[i] - ns2) - kx[i]; // x1 in [-w/2,-w/2+1], up to rounding + // However if N1*epsmach>O(1) then can cause O(1) errors in x1, hence ppoly + // kernel evaluation will fall outside their designed domains, >>1 errors. + // This can only happen if the overall error would be O(1) anyway. Clip x1?? + if (x1 < -ns2) x1 = -ns2; + if (x1 > -ns2 + 1) x1 = -ns2 + 1; // *** + return x1; + }(); + // Libin improvement: pass ker as a parameter and allocate it outside the loop + // gcc13 + 10% speedup + ker_eval(ker.data(), opts, x1); + // const auto ker = ker_eval(opts, x1); const auto j = i1 - off1; // offset rel to subgrid, starts the output indices auto *FINUFFT_RESTRICT trg = du + 2 * j; // restrict helps compiler to vectorize // du is padded, so we can use SIMD even if we write more than ns values in du // ker is also padded. - // regular_part is the largest multiple of 2*ns minus the remainder modulo - // (2*avx_size). This allows to save one load. - // see below for the details. - // adding padding to guarantee that all the elements are computed - // this trick only works when avx_size is a power of 2 - // avx_size*2 is guaranteed to be a power of 2, trivially - static constexpr auto regular_part = (2 * ns + padding) & (-(2 * avx_size)); - // this loop increment is 2*avx_size by design - // it allows to save one load this way - // this does for each element e of the subgrid, x1 defined above and pt the NU point + // regular_part, source Agner Fog + // [VCL](https://www.agner.org/optimize/vcl_manual.pdf) + // Given 2*ns+padding=L so that L = M*simd_size + // if M is even then regular_part == M else regular_part == (M-1) * simd_size + // this means that the elements from regular_part to L are a special case that + // needs a different handling. These last elements are not computed in the loop but, + // the if constexpr block at the end of the loop takes care of them. + // This allows to save one load at each loop iteration. + // The special case, allows to minimize padding otherwise out of bounds access. + // See below for the details. + static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); + // this loop increment is 2*simd_size by design + // it allows to save one load this way at each iteration + + // This does for each element e of the subgrid, x1 defined above and pt the NU point // the following: e += exp(beta.sqrt(1 - (2*x1/n_s)^2))*pt // NOTE: x1 is translated accordingly, please see the ES method for more // using uint8_t in loops to favor unrolling. // Most compilers limit the unrolling to 255, uint8_t is at most 255 - for (uint8_t dx{0}; dx < regular_part; dx += 2 * avx_size) { - // read ker01 which is avx_size wide from ker - // ker01 looks like this: + for (uint8_t dx{0}; dx < regular_part; dx += 2 * simd_size) { + // read ker_v which is simd_size wide from ker + // ker_v looks like this: // +-----------------------+ // |y0|y1|y2|y3|y4|y5|y6|y7| // +-----------------------+ - const auto ker01 = batch_t::load_aligned(ker.data() + dx / 2); + const auto ker_v = simd_type::load_aligned(ker.data() + dx / 2); // read 2*SIMD vectors from the subproblem grid - const auto du_pt0 = batch_t::load_unaligned(trg + dx); - const auto du_pt1 = batch_t::load_unaligned(trg + dx + avx_size); - // swizzle is faster than zip_lo(ker01, ker01) and zip_hi(ker01, ker01) + const auto du_pt0 = simd_type::load_unaligned(trg + dx); + const auto du_pt1 = simd_type::load_unaligned(trg + dx + simd_size); + // swizzle is faster than zip_lo(ker_v, ker_v) and zip_hi(ker_v, ker_v) // swizzle in this case is equivalent to zip_lo and zip_hi respectively - const auto ker0 = xsimd::swizzle(ker01, zip_low_index); + const auto ker0low = xsimd::swizzle(ker_v, zip_low_index); // ker 0 looks like this now: // +-----------------------+ // |y0|y0|y1|y1|y2|y2|y3|y3| // +-----------------------+ - const auto ker1 = xsimd::swizzle(ker01, zip_hi_index); + const auto ker0hi = xsimd::swizzle(ker_v, zip_hi_index); // ker 1 looks like this now: // +-----------------------+ // |y4|y4|y5|y5|y6|y6|y7|y7| @@ -1108,25 +1118,27 @@ FINUFFT_NEVER_INLINE void spread_subproblem_1d_kernel( // same as before each element of the subproblem grid is multiplied by the // corresponding element of the kernel since dd_pt is re|im interleaves res0 is also // correctly re|im interleaved - // doing this for two SIMD vectors at once allows to fully utilize ker01 instead of + // doing this for two SIMD vectors at once allows to fully utilize ker_v instead of // wasting the higher half - const auto res0 = xsimd::fma(ker0, dd_pt, du_pt0); - const auto res1 = xsimd::fma(ker1, dd_pt, du_pt1); + const auto res0 = xsimd::fma(ker0low, dd_pt, du_pt0); + const auto res1 = xsimd::fma(ker0hi, dd_pt, du_pt1); res0.store_unaligned(trg + dx); - res1.store_unaligned(trg + dx + avx_size); + res1.store_unaligned(trg + dx + simd_size); } // sanity check at compile time that all the elements are computed - static_assert(regular_part + avx_size >= 2 * ns); - // case where the 2*ns is not a multiple of 2*avx_size + static_assert(regular_part + simd_size >= 2 * ns); + // case where the 2*ns is not a multiple of 2*simd_size // checking 2*ns instead of 2*ns+padding as we do not need to compute useless zeros... if constexpr (regular_part < 2 * ns) { // here we need to load the last kernel values, // but we can avoid computing extra padding // also this padding will result in out-of-bounds access to trg - const auto ker01 = batch_t::load_unaligned(ker.data() + (regular_part / 2)); - const auto du_pt = batch_t::load_unaligned(trg + regular_part); - const auto ker0 = xsimd::swizzle(ker01, zip_low_index); - const auto res = xsimd::fma(ker0, dd_pt, du_pt); + // The difference between this and the loop is that ker0hi is not computed and + // the corresponding memory is not accessed + const auto ker0 = simd_type::load_unaligned(ker.data() + (regular_part / 2)); + const auto du_pt = simd_type::load_unaligned(trg + regular_part); + const auto ker0low = xsimd::swizzle(ker0, zip_low_index); + const auto res = xsimd::fma(ker0low, dd_pt, du_pt); res.store_unaligned(trg + regular_part); } } @@ -1213,55 +1225,81 @@ FINUFFT_NEVER_INLINE static void spread_subproblem_2d_kernel( For algoritmic details see spread_subproblem_1d_kernel. */ { - using batch_t = PaddedSIMD; - using arch_t = typename batch_t::arch_type; + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto avx_size = batch_t::size; - static constexpr auto alignment = batch_t::arch_type::alignment(); + static constexpr auto simd_size = simd_type::size; + static constexpr auto alignment = arch_t::alignment(); // Kernel values stored in consecutive memory. This allows us to compute // values in all three directions in a single kernel evaluation call. static constexpr auto ns2 = ns * FLT(0.5); // half spread width alignas(alignment) std::array kernel_values{0}; std::fill(du, du + 2 * size1 * size2, 0); // initialized to 0 due to the padding for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); + const auto dd_pt = initialize_complex_register(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); const auto x1 = (FLT)std::ceil(kx[pt] - ns2) - kx[pt]; const auto x2 = (FLT)std::ceil(ky[pt] - ns2) - ky[pt]; - // alignas(alignment) const auto kernel_values = - ker_eval(kernel_values.data(), opts, x1, x2); + ker_eval(kernel_values.data(), opts, x1, x2); const auto *ker1 = kernel_values.data(); const auto *ker2 = kernel_values.data() + MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop // here 2* is because of complex - static constexpr uint8_t batches = (2 * ns + padding) / avx_size; - static_assert(batches > 0, "batches must be greater than 0"); - batch_t ker1val_batches[batches]; - - for (uint8_t i = 0; i < (batches & ~1); i += 2) { - const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); - const auto ker00 = xsimd::swizzle(ker01, zip_low_index); - const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); - ker1val_batches[i] = ker00 * dd_pt; - ker1val_batches[i + 1] = ker11 * dd_pt; - } - if constexpr (batches % 2) { - const auto ker1_batch = - batch_t::load_unaligned(ker1 + (batches - 1) * avx_size / 2); - const auto res = xsimd::swizzle(ker1_batch, zip_low_index) * dd_pt; - ker1val_batches[batches - 1] = res; - } + static constexpr uint8_t kerval_vectors = (2 * ns + padding) / simd_size; + static_assert(kerval_vectors > 0, "kerval_vectors must be greater than 0"); + // wrapping this in a lambda gives an extra 10% speedup (gcc13) + // the compiler realizes the values are constant after the lambda + // Guess: it realizes what is the invariant and moves some operations outside the loop + // it might also realize that some variables are not needed anymore and can + // re-use the registers with other data. + const auto ker1val_v = [ker1, dd_pt]() constexpr noexcept { + // array of simd_registers that will store the kernel values + std::array ker1val_v{}; + // similar to the 1D case, we compute the kernel values in advance + // and store them in simd_registers. + // Compared to the 1D case the difference is that here ker values are stored in + // an array of simd_registers. + // This is a hint to the compiler to keep the values in registers, instead of + // pushing them to the stack. + // Same as the 1D case, the loop is structured in a way to half the number of loads + // This cause an issue with the last elements, but this is handled in the + // `if constexpr`. + // For more details please read the 1D case. The difference is that + // here the loop is on the number of simd vectors In the 1D case the loop is on the + // number of elements in the kernel + for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable) + i += 2) { + const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + // this initializes the entire vector registers with the same value + // the ker1val_v[i] looks like this: + // +-----------------------+ + // |y0|y0|y0|y0|y0|y0|y0|y0| + // +-----------------------+ + ker1val_v[i] = ker1low * dd_pt; + ker1val_v[i + 1] = ker1hi * dd_pt; // same as above + } + if constexpr (kerval_vectors % 2) { + const auto ker1_v = + simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2); + const auto res = xsimd::swizzle(ker1_v, zip_low_index) * dd_pt; + ker1val_v[kerval_vectors - 1] = res; + } + return ker1val_v; + }(); + // critical inner loop: for (auto dy = 0; dy < ns; ++dy) { const auto j = size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid auto *FINUFFT_RESTRICT trg = du + 2 * j; - const batch_t kerval_batch(ker2[dy]); - for (uint8_t i = 0; i < batches; ++i) { - const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); - const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); - result.store_unaligned(trg + i * avx_size); + const simd_type kerval_v(ker2[dy]); + for (uint8_t i = 0; i < kerval_vectors; ++i) { + const auto trg_v = simd_type::load_unaligned(trg + i * simd_size); + const auto result = xsimd::fma(kerval_v, ker1val_v[i], trg_v); + result.store_unaligned(trg + i * simd_size); } } } @@ -1320,16 +1358,18 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const BIGINT size2, const BIGINT size3, FLT *FINUFFT_RESTRICT du, const BIGINT M, const FLT *kx, const FLT *ky, const FLT *kz, const FLT *dd, const finufft_spread_opts &opts) noexcept { - using batch_t = PaddedSIMD; - using arch_t = typename batch_t::arch_type; + using simd_type = PaddedSIMD; + using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto avx_size = batch_t::size; - static constexpr auto alignment = batch_t::arch_type::alignment(); - static constexpr auto ns2 = ns * FLT(0.5); // half spread width + static constexpr auto simd_size = simd_type::size; + static constexpr auto alignment = arch_t::alignment(); + + static constexpr auto ns2 = ns * FLT(0.5); // half spread width alignas(alignment) std::array kernel_values{0}; std::fill(du, du + 2 * size1 * size2 * size3, 0); + for (uint64_t pt = 0; pt < M; pt++) { // loop over NU pts - const auto dd_pt = initialize_complex_batch(dd[pt * 2], dd[pt * 2 + 1]); + const auto dd_pt = initialize_complex_register(dd[pt * 2], dd[pt * 2 + 1]); // ceil offset, hence rounding, must match that in get_subgrid... const auto i1 = (BIGINT)std::ceil(kx[pt] - ns2); // fine grid start indices const auto i2 = (BIGINT)std::ceil(ky[pt] - ns2); @@ -1338,48 +1378,50 @@ FINUFFT_NEVER_INLINE void spread_subproblem_3d_kernel( const auto x2 = std::ceil(ky[pt] - ns2) - ky[pt]; const auto x3 = std::ceil(kz[pt] - ns2) - kz[pt]; - ker_eval(kernel_values.data(), opts, x1, x2, x3); + ker_eval(kernel_values.data(), opts, x1, x2, x3); const auto *ker1 = kernel_values.data(); const auto *ker2 = kernel_values.data() + MAX_NSPREAD; const auto *ker3 = kernel_values.data() + 2 * MAX_NSPREAD; // Combine kernel with complex source value to simplify inner loop // here 2* is because of complex - // Batches is the number of SIMD iterations needed to compute all the elements - static constexpr uint8_t batches = (2 * ns + padding) / avx_size; - static_assert(batches > 0, "batches must be greater than 0"); - batch_t ker1val_batches[batches]; - // Iterate over batches but in case the number of batches is odd - // we need to handle the last batch separately - // to the & ~1 is to ensure that we do not iterate over the last batch if it is odd - // as it sets the last bit to 0 - for (uint8_t i = 0; i < (batches & ~1); i += 2) { - const auto ker01 = batch_t::load_aligned(ker1 + i * avx_size / 2); - const auto ker00 = xsimd::swizzle(ker01, zip_low_index); - const auto ker11 = xsimd::swizzle(ker01, zip_hi_index); - ker1val_batches[i] = ker00 * dd_pt; - ker1val_batches[i + 1] = ker11 * dd_pt; - } - - // (at compile time) check if the number of batches is odd - // if it is we need to handle the last batch separately - if constexpr (batches % 2) { - const auto ker1_batch = - batch_t::load_unaligned(ker1 + (batches - 1) * avx_size / 2); - const auto res = xsimd::swizzle(ker1_batch, zip_low_index) * dd_pt; - ker1val_batches[batches - 1] = res; - } + // kerval_vectors is the number of SIMD iterations needed to compute all the elements + static constexpr uint8_t kerval_vectors = (2 * ns + padding) / simd_size; + static_assert(kerval_vectors > 0, "kerval_vectors must be greater than 0"); + const auto ker1val_v = [ker1, dd_pt]() constexpr noexcept { + std::array ker1val_v{}; + // Iterate over kerval_vectors but in case the number of kerval_vectors is odd + // we need to handle the last batch separately + // to the & ~1 is to ensure that we do not iterate over the last batch if it is odd + // as it sets the last bit to 0 + for (uint8_t i = 0; i < (kerval_vectors & ~1); // NOLINT(*-too-small-loop-variable + i += 2) { + const auto ker1_v = simd_type::load_aligned(ker1 + i * simd_size / 2); + const auto ker1low = xsimd::swizzle(ker1_v, zip_low_index); + const auto ker1hi = xsimd::swizzle(ker1_v, zip_hi_index); + ker1val_v[i] = ker1low * dd_pt; + ker1val_v[i + 1] = ker1hi * dd_pt; + } + // (at compile time) check if the number of kerval_vectors is odd + // if it is we need to handle the last batch separately + if constexpr (kerval_vectors % 2) { + const auto ker1_v = + simd_type::load_unaligned(ker1 + (kerval_vectors - 1) * simd_size / 2); + const auto res = xsimd::swizzle(ker1_v, zip_low_index) * dd_pt; + ker1val_v[kerval_vectors - 1] = res; + } + return ker1val_v; + }(); // critical inner loop: for (uint8_t dz{0}; dz < ns; ++dz) { const auto oz = size1 * size2 * (i3 - off3 + dz); // offset due to z for (uint8_t dy{0}; dy < ns; ++dy) { const auto j = oz + size1 * (i2 - off2 + dy) + i1 - off1; // should be in subgrid auto *FINUFFT_RESTRICT trg = du + 2 * j; - const auto kerval = ker2[dy] * ker3[dz]; - const batch_t kerval_batch(kerval); - for (uint8_t i{0}; i < batches; ++i) { - const auto trg_batch = batch_t::load_unaligned(trg + i * avx_size); - const auto result = xsimd::fma(kerval_batch, ker1val_batches[i], trg_batch); - result.store_unaligned(trg + i * avx_size); + const simd_type kerval_v(ker2[dy] * ker3[dz]); + for (uint8_t i{0}; i < kerval_vectors; ++i) { + const auto trg_v = simd_type::load_unaligned(trg + i * simd_size); + const auto result = xsimd::fma(kerval_v, ker1val_v[i], trg_v); + result.store_unaligned(trg + i * simd_size); } } } @@ -1720,13 +1762,13 @@ FLT fold_rescale(const FLT x, const BIGINT N) noexcept { return (result - floor(result)) * FLT(N); } -template +template auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, const V... elems) noexcept { /* Utility function that allows to move the kernel evaluation outside the spreader for clarity Inputs are: ns = kernel width kerevalmeth = kernel evaluation method T = - (single or double precision) type of the kernel batch_t = batch type for Horner - vectorization (default is the optimal batch size) finufft_spread_opts as Horner needs + (single or double precision) type of the kernel simd_type = xsimd::batch for Horner + vectorization (default is the optimal simd size) finufft_spread_opts as Horner needs the oversampling factor elems = kernel arguments examples usage is ker_eval(opts, x, y, z) // for 3D or ker_eval(opts, x, y) // for 2D or ker_eval(opts, x) // for 1D @@ -1736,10 +1778,10 @@ auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, for (auto i = 0; i < sizeof...(elems); ++i) { // compile time branch no performance overhead if constexpr (kerevalmeth == 1) { - eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], opts); + eval_kernel_vec_Horner(ker + (i * MAX_NSPREAD), inputs[i], opts); } if constexpr (kerevalmeth == 0) { - alignas(batch_t::arch_type::alignment()) std::array kernel_args{}; + alignas(simd_type::arch_type::alignment()) std::array kernel_args{}; set_kernel_args(kernel_args.data(), inputs[i], opts); evaluate_kernel_vector(ker + (i * MAX_NSPREAD), kernel_args.data(), opts, ns); } @@ -1773,7 +1815,7 @@ constexpr T generate_sequence_impl(V a, V b, index_sequence) noexcept { } template -constexpr auto initialize_complex_batch(V a, V b) noexcept { +constexpr auto initialize_complex_register(V a, V b) noexcept { // populates a SIMD register with a and b interleaved // for example: // +-------------------------------+ @@ -1796,46 +1838,47 @@ template constexpr auto BestSIMDHelper() { } } -template constexpr uint8_t min_batch_size() { - // finds the smallest batch size that can handle N elements - // batch size is the SIMD width in xsimd terminology +template constexpr uint8_t min_simd_width() { + // finds the smallest simd width that can handle N elements + // simd size is batch size the SIMD width in xsimd terminology if constexpr (std::is_void_v>) { - return min_batch_size(); + return min_simd_width(); } else { return N; } }; -template constexpr auto find_optimal_batch_size() { - // finds the smallest batch size that minimizes the number of iterations +template constexpr auto find_optimal_simd_width() { + // finds the smallest simd width that minimizes the number of iterations // NOTE: might be suboptimal for some cases 2^N+1 for example // in the future we might want to implement a more sophisticated algorithm - uint8_t optimal_batch_size = min_batch_size(); - uint8_t min_iterations = (N + optimal_batch_size - 1) / optimal_batch_size; - for (uint8_t batch_size = optimal_batch_size; - batch_size <= xsimd::batch::size; - batch_size *= 2) { - uint8_t iterations = (N + batch_size - 1) / batch_size; + uint8_t optimal_simd_width = min_simd_width(); + uint8_t min_iterations = (N + optimal_simd_width - 1) / optimal_simd_width; + for (uint8_t simd_width = optimal_simd_width; + simd_width <= xsimd::batch::size; + simd_width *= 2) { + uint8_t iterations = (N + simd_width - 1) / simd_width; if (iterations < min_iterations) { min_iterations = iterations; - optimal_batch_size = batch_size; + optimal_simd_width = simd_width; } } - return optimal_batch_size; + return optimal_simd_width; } -template constexpr auto GetPaddedSIMDSize() { - // helper function to get the SIMD size with padding for the given number of elements +template constexpr auto GetPaddedSIMDWidth() { + // helper function to get the SIMD width with padding for the given number of elements // that minimizes the number of iterations - return xsimd::make_sized_batch()>::type::size; + return xsimd::make_sized_batch()>::type::size; } template constexpr auto get_padding() { // helper function to get the padding for the given number of elements - // ns is known at compile time - // rounds ns to the next multiple of the SIMD width - // then subtracts ns to get the padding - constexpr uint8_t width = GetPaddedSIMDSize(); + // ns is known at compile time, rounds ns to the next multiple of the SIMD width + // then subtracts ns to get the padding using a bitwise and trick + // WARING: this trick works only for power of 2s + // SOURCE: Agner Fog's VCL manual + constexpr uint8_t width = GetPaddedSIMDWidth(); return ((ns + width - 1) & (-width)) - ns; } @@ -1843,7 +1886,7 @@ template constexpr auto get_padding_helper(uint8_t runtime_ // helper function to get the padding for the given number of elements where ns is // known at runtime, it uses recursion to find the padding // this allows to avoid having a function with a large number of switch cases - // as GetPaddedSIMDSize requires a compile time value + // as GetPaddedSIMDWidth requires a compile time value // it cannot be a lambda function because of the template recursion if constexpr (ns < 2) { return 0; From 16973b5c87900dbd10109162bafb80158ddab27b Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 19:31:03 -0400 Subject: [PATCH 23/35] fixed horner files --- makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile b/makefile index a3020ce0b..bd0c65c79 100644 --- a/makefile +++ b/makefile @@ -169,7 +169,7 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(FC) -DSINGLE -c $(FFLAGS) $< -o $@ # included auto-generated code dependency... -src/spreadinterp.o: src/ker_horner_allw_loop.c src/ker_lowupsampfac_horner_allw_loop.c +src/spreadinterp.o: src/ker_horner_allw_loop_constexpr.h src/ker_lowupsampfac_horner_allw_loop_constexpr.c # lib ----------------------------------------------------------------------- From 468844f4561d39c7de0899507d428b7a8b53d5d1 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 19:47:58 -0400 Subject: [PATCH 24/35] using const ref when possible --- include/finufft/spreadinterp.h | 19 +++++------- src/spreadinterp.cpp | 54 +++++++++++++++++----------------- 2 files changed, 35 insertions(+), 38 deletions(-) diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 0900dd31b..a909e4b37 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -31,28 +31,25 @@ namespace spreadinterp { // things external (spreadinterp) interface needs... FINUFFT_EXPORT int FINUFFT_CDECL spreadinterp( BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, - FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts); + FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - finufft_spread_opts opts); + const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - finufft_spread_opts opts); + const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL interpSorted( BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, - int did_sort); + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted( - BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, - int did_sort); + const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, + BIGINT M, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform, + const finufft_spread_opts &opts, int did_sort); FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts, + FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts, int did_sort); FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts); -FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel_noexp(FLT x, - const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, int debug, int showwarn, int dim); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index c4df21207..7e83ba727 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -104,7 +104,7 @@ static void get_subgrid(BIGINT &offset1, BIGINT &offset2, BIGINT &offset3, // ========================================================================== int spreadinterp(BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, - FLT *ky, FLT *kz, FLT *data_nonuniform, finufft_spread_opts opts) + FLT *ky, FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts) /* ------------Spreader/interpolator for 1, 2, or 3 dimensions -------------- If opts.spread_direction=1, evaluate, in the 1D case, @@ -196,7 +196,7 @@ static int ndims_from_Ns(BIGINT N1, BIGINT N2, BIGINT N3) } int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - finufft_spread_opts opts) + const finufft_spread_opts &opts) /* This does just the input checking and reporting for the spreader. See spreadinterp() for input arguments and meaning of returned value. Split out by Melody Shih, Jun 2018. Finiteness chk Barnett 7/30/18. @@ -220,7 +220,7 @@ int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT } int indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, - FLT *ky, FLT *kz, finufft_spread_opts opts) + FLT *ky, FLT *kz, const finufft_spread_opts &opts) /* This makes a decision whether or not to sort the NU pts (influenced by opts.sort), and if yes, calls either single- or multi-threaded bin sort, writing reordered index list to sort_indices. If decided not to sort, the @@ -298,7 +298,8 @@ int indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, F int spreadinterpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) + FLT *data_nonuniform, const finufft_spread_opts &opts, + int did_sort) /* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. See spreadinterp() above for inputs arguments and definitions. Return value should always be 0 (no error reporting). @@ -311,15 +312,16 @@ int spreadinterpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, else // ================= direction 2 (interpolation) =========== interpSorted(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, - opts, did_sort); + opts); return 0; } // -------------------------------------------------------------------------- -int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - finufft_spread_opts opts, int did_sort) +int spreadSorted(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + const FLT *data_nonuniform, const finufft_spread_opts &opts, + int did_sort) // Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. { CNTime timer; @@ -441,9 +443,10 @@ int spreadSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *dat // -------------------------------------------------------------------------- template -int interpSorted_kernel(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) +static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, + const BIGINT N2, const BIGINT N3, const FLT *data_uniform, + const BIGINT M, FLT *kx, FLT *ky, FLT *kz, + FLT *data_nonuniform, const finufft_spread_opts &opts) // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { @@ -546,43 +549,40 @@ int interpSorted_kernel(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT } template -int interpSorted_dispatch(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, finufft_spread_opts opts, int did_sort) { +static int interpSorted_dispatch(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + FLT *data_nonuniform, const finufft_spread_opts &opts) { static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case if (opts.kerevalmeth) - return interpSorted_kernel(sort_indices, N1, N2, N3, - data_uniform, M, kx, ky, kz, - data_nonuniform, opts, did_sort); + return interpSorted_kernel( + sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); else { - return interpSorted_kernel(sort_indices, N1, N2, N3, - data_uniform, M, kx, ky, kz, - data_nonuniform, opts, did_sort); + return interpSorted_kernel( + sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); } } else { if (opts.nspread == NS) { if (opts.kerevalmeth) { return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts, did_sort); + kx, ky, kz, data_nonuniform, opts); } else { return interpSorted_kernel(sort_indices, N1, N2, N3, data_uniform, M, - kx, ky, kz, data_nonuniform, opts, - did_sort); + kx, ky, kz, data_nonuniform, opts); } } else { return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts, did_sort); + ky, kz, data_nonuniform, opts); } } } int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, - finufft_spread_opts opts, int did_sort) { + const finufft_spread_opts &opts) { return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, - ky, kz, data_nonuniform, opts, did_sort); + ky, kz, data_nonuniform, opts); } /////////////////////////////////////////////////////////////////////////// @@ -760,7 +760,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ alignas(alignment) static constexpr auto padded_coeffs = pad_2D_array_with_zeros(horner_coeffs); - const auto zv = simd_type(z); + const simd_type zv(z); for (uint8_t i = 0; i < w; i += simd_size) { auto k = simd_type::load_aligned(padded_coeffs[0].data() + i); From 132d7663fd6f5d3923752614f27464db3819897c Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 19:53:31 -0400 Subject: [PATCH 25/35] using const ref when possible --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1626ad35e..129bc2ff9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,8 +7,8 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) if (CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers - set(FINUFFT_CXX_FLAGS_RELEASE -funroll-loops -ffp-contract=fast) - set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -O3 -g -DNDEBUG ${FINUFFT_CXX_FLAGS_RELEASE}) + set(FINUFFT_CXX_FLAGS_RELEASE -O3 -funroll-loops -ffp-contract=fast) + set(FINUFFT_CXX_FLAGS_RELWITHDEBINFO -g ${FINUFFT_CXX_FLAGS_RELEASE}) endif () include(CTest) From aabdb57c7b3e63008cea30e6ebaea2a6a2001995 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 20:09:33 -0400 Subject: [PATCH 26/35] using const and restrict where possible --- include/finufft/spreadinterp.h | 13 +++--- src/spreadinterp.cpp | 82 +++++++++++++++++++--------------- 2 files changed, 53 insertions(+), 42 deletions(-) diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index a909e4b37..d7a513bb5 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -39,16 +39,19 @@ FINUFFT_EXPORT int FINUFFT_CDECL indexSort(BIGINT *sort_indices, BIGINT N1, BIGI BIGINT N3, BIGINT M, FLT *kx, FLT *ky, FLT *kz, const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL interpSorted( - BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts); + const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL spreadSorted( const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, const FLT *data_nonuniform, const finufft_spread_opts &opts, int did_sort); FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted( - BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, BIGINT M, - FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, const finufft_spread_opts &opts, - int did_sort); + const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts, int did_sort); FINUFFT_EXPORT FLT FINUFFT_CDECL evaluate_kernel(FLT x, const finufft_spread_opts &opts); FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, int kerevalmeth, diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 7e83ba727..eff528ef9 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -7,12 +7,12 @@ #include #include "ker_horner_allw_loop_constexpr.h" + #include -#include -#include -#include -#include +#include +#include +#include #include using namespace std; @@ -296,10 +296,11 @@ int indexSort(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, F return did_sort; } -int spreadinterpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, const finufft_spread_opts &opts, - int did_sort) +int spreadinterpSorted(const BIGINT *sort_indices, const BIGINT N1, const BIGINT N2, + const BIGINT N3, FLT *data_uniform, const BIGINT M, + FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, + FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, + const finufft_spread_opts &opts, int did_sort) /* Logic to select the main spreading (dir=1) vs interpolation (dir=2) routine. See spreadinterp() above for inputs arguments and definitions. Return value should always be 0 (no error reporting). @@ -319,7 +320,8 @@ int spreadinterpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, // -------------------------------------------------------------------------- int spreadSorted(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, + FLT *FINUFFT_RESTRICT data_uniform, BIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, const FLT *data_nonuniform, const finufft_spread_opts &opts, int did_sort) // Spread NU pts in sorted order to a uniform grid. See spreadinterp() for doc. @@ -443,10 +445,11 @@ int spreadSorted(const BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, // -------------------------------------------------------------------------- template -static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, - const BIGINT N2, const BIGINT N3, const FLT *data_uniform, - const BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, const finufft_spread_opts &opts) +FINUFFT_NEVER_INLINE static int interpSorted_kernel( + const BIGINT *sort_indices, const BIGINT N1, const BIGINT N2, const BIGINT N3, + const FLT *data_uniform, const BIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) // Interpolate to NU pts in sorted order from a uniform grid. // See spreadinterp() for doc. { @@ -476,9 +479,9 @@ static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, FLT outbuf[2 * CHUNKSIZE]; // Kernels: static alloc is faster, so we do it for up to 3D... alignas(alignment) std::array kernel_values{0}; - FLT *FINUFFT_RESTRICT ker1 = kernel_values.data(); - FLT *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; - FLT *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker1 = kernel_values.data(); + auto *FINUFFT_RESTRICT ker2 = kernel_values.data() + MAX_NSPREAD; + auto *FINUFFT_RESTRICT ker3 = kernel_values.data() + 2 * MAX_NSPREAD; // Loop over interpolation chunks #pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: @@ -487,7 +490,7 @@ static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, { // Setup buffers for this chunk - int bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; + const int bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; for (int ibuf = 0; ibuf < bufsize; ibuf++) { BIGINT j = sort_indices[i + ibuf]; jlist[ibuf] = j; @@ -498,20 +501,20 @@ static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, // Loop over targets in chunk for (int ibuf = 0; ibuf < bufsize; ibuf++) { - FLT xj = xjlist[ibuf]; - FLT yj = (ndims > 1) ? yjlist[ibuf] : 0; - FLT zj = (ndims > 2) ? zjlist[ibuf] : 0; + const auto xj = xjlist[ibuf]; + const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; + const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; - FLT *target = outbuf + 2 * ibuf; + auto *FINUFFT_RESTRICT target = outbuf + 2 * ibuf; // coords (x,y,z), spread block corner index (i1,i2,i3) of current NU targ - BIGINT i1 = (BIGINT)std::ceil(xj - ns2); // leftmost grid index - BIGINT i2 = (ndims > 1) ? (BIGINT)std::ceil(yj - ns2) : 0; // min y grid index - BIGINT i3 = (ndims > 2) ? (BIGINT)std::ceil(zj - ns2) : 0; // min z grid index + const auto i1 = (BIGINT)std::ceil(xj - ns2); // leftmost grid index + const auto i2 = (ndims > 1) ? (BIGINT)std::ceil(yj - ns2) : 0; // min y grid index + const auto i3 = (ndims > 2) ? (BIGINT)std::ceil(zj - ns2) : 0; // min z grid index - FLT x1 = (FLT)i1 - xj; // shift of ker center, in [-w/2,-w/2+1] - FLT x2 = (ndims > 1) ? (FLT)i2 - yj : 0; - FLT x3 = (ndims > 2) ? (FLT)i3 - zj : 0; + const auto x1 = (FLT)i1 - xj; // shift of ker center, in [-w/2,-w/2+1] + const auto x2 = (ndims > 1) ? (FLT)i2 - yj : 0; + const auto x3 = (ndims > 2) ? (FLT)i3 - zj : 0; // eval kernel values patch and use to interpolate from uniform data... if (!(opts.flags & TF_OMIT_SPREADING)) { @@ -537,7 +540,7 @@ static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, // Copy result buffer to output array for (int ibuf = 0; ibuf < bufsize; ibuf++) { - BIGINT j = jlist[ibuf]; + const UBIGINT j = jlist[ibuf]; data_nonuniform[2 * j] = outbuf[2 * ibuf]; data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; } @@ -549,9 +552,11 @@ static int interpSorted_kernel(const BIGINT *sort_indices, const BIGINT N1, } template -static int interpSorted_dispatch(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, - FLT *data_uniform, BIGINT M, FLT *kx, FLT *ky, FLT *kz, - FLT *data_nonuniform, const finufft_spread_opts &opts) { +static int interpSorted_dispatch( + const BIGINT *sort_indices, const BIGINT N1, const BIGINT N2, const BIGINT N3, + FLT *FINUFFT_RESTRICT data_uniform, const BIGINT M, FLT *FINUFFT_RESTRICT kx, + FLT *FINUFFT_RESTRICT ky, FLT *FINUFFT_RESTRICT kz, + FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { static_assert(MIN_NSPREAD <= NS <= MAX_NSPREAD, "NS must be in the range (MIN_NSPREAD, MAX_NSPREAD)"); if constexpr (NS == MIN_NSPREAD) { // Base case @@ -578,8 +583,10 @@ static int interpSorted_dispatch(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIG } } -int interpSorted(BIGINT *sort_indices, BIGINT N1, BIGINT N2, BIGINT N3, FLT *data_uniform, - BIGINT M, FLT *kx, FLT *ky, FLT *kz, FLT *data_nonuniform, +int interpSorted(const BIGINT *sort_indices, const BIGINT N1, const BIGINT N2, + const BIGINT N3, FLT *FINUFFT_RESTRICT data_uniform, const BIGINT M, + FLT *FINUFFT_RESTRICT kx, FLT *FINUFFT_RESTRICT ky, + FLT *FINUFFT_RESTRICT kz, FLT *FINUFFT_RESTRICT data_nonuniform, const finufft_spread_opts &opts) { return interpSorted_dispatch(sort_indices, N1, N2, N3, data_uniform, M, kx, ky, kz, data_nonuniform, opts); @@ -781,7 +788,7 @@ Two upsampfacs implemented. Params must match ref formula. Barnett 4/24/18 */ template void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, BIGINT i1, - BIGINT N1) + const BIGINT N1) /* 1D interpolate complex values from size-ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the 1d kernel evaluation list ker1. @@ -839,7 +846,8 @@ void interp_line(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker, BI template void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, BIGINT i1, BIGINT i2, BIGINT N1, BIGINT N2) + const FLT *ker2, const BIGINT i1, const BIGINT i2, const BIGINT N1, + const BIGINT N2) /* 2D interpolate complex values from a ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns outer product of the 1d kernel lists ker1 and ker2. @@ -919,8 +927,8 @@ void interp_square(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, template void interp_cube(FLT *FINUFFT_RESTRICT target, const FLT *du, const FLT *ker1, - const FLT *ker2, const FLT *ker3, BIGINT i1, BIGINT i2, BIGINT i3, - BIGINT N1, BIGINT N2, BIGINT N3) + const FLT *ker2, const FLT *ker3, const BIGINT i1, const BIGINT i2, + const BIGINT i3, const BIGINT N1, const BIGINT N2, const BIGINT N3) /* 3D interpolate complex values from a ns*ns*ns block of the du (uniform grid data) array to a single complex output value "target", using as weights the ns*ns*ns outer product of the 1d kernel lists ker1, ker2, and ker3. From 3efa84cedb9c122d2059a935263ef4257cb1d8aa Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Thu, 20 Jun 2024 20:18:49 -0400 Subject: [PATCH 27/35] using const and restrict where possible --- include/finufft/defs.h | 4 ++++ src/spreadinterp.cpp | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/finufft/defs.h b/include/finufft/defs.h index df7ad4249..569089c4f 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -43,14 +43,18 @@ #define FINUFFT_ALWAYS_INLINE __forceinline #define FINUFFT_NEVER_INLINE __declspec(noinline) #define FINUFFT_RESTRICT __restrict +#define FINUFFT_UNREACHABLE __assume(0) + #elif defined(__GNUC__) || defined(__clang__) #define FINUFFT_ALWAYS_INLINE __attribute__((always_inline)) inline #define FINUFFT_NEVER_INLINE __attribute__((noinline)) #define FINUFFT_RESTRICT __restrict__ +#define FINUFFT_UNREACHABLE __builtin_unreachable() #else #define FINUFFT_ALWAYS_INLINE inline #define FINUFFT_NEVER_INLINE #define FINUFFT_RESTRICT +#define FINUFFT_UNREACHABLE #endif // ------------- Library-wide algorithm parameter settings ---------------- diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index eff528ef9..26c2ef56a 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -461,8 +461,8 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( static constexpr auto ns2 = ns * FLT(0.5); // half spread width, used as stencil shift CNTime timer; - int ndims = ndims_from_Ns(N1, N2, N3); - int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp + const int ndims = ndims_from_Ns(N1, N2, N3); + int nthr = MY_OMP_GET_MAX_THREADS(); // guess # threads to use to interp if (opts.nthreads > 0) nthr = opts.nthreads; // user override, now without limit #ifndef _OPENMP nthr = 1; // single-threaded lib must override user @@ -533,6 +533,7 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( N3); break; default: // can't get here + FINUFFT_UNREACHABLE; break; } } From 3e243269ae52f004bef36b6bd22985806fa03b19 Mon Sep 17 00:00:00 2001 From: Marco Barbone Date: Fri, 21 Jun 2024 14:21:05 -0400 Subject: [PATCH 28/35] re-added inline to ker_eval --- src/spreadinterp.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 26c2ef56a..3212e6705 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -55,8 +55,9 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset template()>, typename... V> -static auto ker_eval(FLT *FINUFFT_RESTRICT ker, const finufft_spread_opts &opts, - const V... elems) noexcept; +static FINUFFT_ALWAYS_INLINE auto ker_eval(FLT *FINUFFT_RESTRICT ker, + const finufft_spread_opts &opts, + const V... elems) noexcept; static FINUFFT_ALWAYS_INLINE FLT fold_rescale(FLT x, BIGINT N) noexcept; static FINUFFT_ALWAYS_INLINE void set_kernel_args( FLT *args, FLT x, const finufft_spread_opts &opts) noexcept; From 6d1639817d6c808589c54b78ad4b635922ee43ac Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Fri, 21 Jun 2024 17:19:21 -0400 Subject: [PATCH 29/35] doc clang-format --- docs/devnotes.rst | 3 +++ perftest/compare_spreads.jl | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/docs/devnotes.rst b/docs/devnotes.rst index 2a3683761..57b400f16 100644 --- a/docs/devnotes.rst +++ b/docs/devnotes.rst @@ -17,6 +17,9 @@ Developer notes * There are some sphinx tags in the source code, indicated by @ in comments. Please leave these alone since they are needed by the doc generation. +* Source code is now in clang format: devs should run ``clang-format --files= -i --style=.clang-format`` before pushing, or set up their editor to do this + automatically. + * If you add a new option field (recall it must be plain C style only, no special types) to ``include/finufft_opts.h``, don't forget to add it to ``include/finufft.fh``, ``include/finufft_mod.f90``, ``matlab/finufft.mw``, ``python/finufft/_finufft.py``, and the Julia interface, as well a paragraph describing its use in the docs. Also to set its default value in ``src/finufft.cpp``. You will then need to regenerate the docs as in ``docs/README``. * For testing and performance measuring routines see ``test/README`` and ``perftest/README``. We need more of the latter, eg, something making performance graphs that enable rapid eyeball comparison of various settings/machines. Marco is working on that. diff --git a/perftest/compare_spreads.jl b/perftest/compare_spreads.jl index f7dec6db1..6d1e8b780 100644 --- a/perftest/compare_spreads.jl +++ b/perftest/compare_spreads.jl @@ -4,7 +4,7 @@ using CairoMakie using JLD2 # for load/save arrays to file using UnPack -fnam = "results/master-vs-svec2l_gcc114_5700U_nthr8" # outfile head +fnam = "results/master-vs-svec2_gcc114_5700U_nthr1" # outfile head # locations of pair of FINUFFT repos to compare... repo1 = "/home/alex/numerics/finufft" repo2 = "/home/alex/numerics/nufft/finufft-svec2" @@ -62,7 +62,7 @@ function plot_all(fnam,ts,wstr,dims,M,N,nthr) end # main script........................................................................... -nthr = 8; # 1: leave cpu freq at max (4.3GHz); for 8, lower to 2.7GHz since drops to this. +nthr = 1; # 1: leave cpu freq at max (4.3GHz); for 8, lower to 2.7GHz since drops to this. # set freq lim with cpupower-gui # check with: watch -n 1 sort -nr /sys/devices/system/cpu/cpu*/cpufreq/scaling_cur_freq dims = 1:3 From 076e0db2b6c2a6194ce2c56013b405538a74bbf5 Mon Sep 17 00:00:00 2001 From: Libin Lu Date: Fri, 21 Jun 2024 19:31:02 -0400 Subject: [PATCH 30/35] try to make ci work --- .github/workflows/python_wheel.yml | 5 +---- CMakeLists.txt | 2 ++ cmake/setupXSIMD.cmake | 7 +++++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index 688589ab0..5f25feca5 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -186,12 +186,9 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: git-for-windows/setup-git-for-windows-sdk - with: - flavor: minimal - name: Install GCC and make - run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw" + run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw git" - name: Build and Test Python 3.8 uses: actions/setup-python@v5 diff --git a/CMakeLists.txt b/CMakeLists.txt index 129bc2ff9..e011d1c27 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -161,6 +161,8 @@ function(set_finufft_options target) target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR}) endif () + # XSIMD is a header only library, so we just need to include the headers + target_include_directories(${target} PUBLIC ${XSIMD_INCLUDE_DIR}) endfunction() if (FINUFFT_USE_CPU) diff --git a/cmake/setupXSIMD.cmake b/cmake/setupXSIMD.cmake index 303ef2986..f3d6aefd0 100644 --- a/cmake/setupXSIMD.cmake +++ b/cmake/setupXSIMD.cmake @@ -1,5 +1,5 @@ CPMAddPackage( - NAME findxtl + NAME xtl GIT_REPOSITORY "https://github.com/xtensor-stack/xtl.git" GIT_TAG ${XTL_VERSION} EXCLUDE_FROM_ALL YES @@ -8,7 +8,7 @@ CPMAddPackage( ) CPMAddPackage( - NAME findxsimd + NAME xsimd GIT_REPOSITORY "https://github.com/xtensor-stack/xsimd.git" GIT_TAG ${XSIMD_VERSION} EXCLUDE_FROM_ALL YES @@ -17,3 +17,6 @@ CPMAddPackage( "XSIMD_SKIP_INSTALL YES" "XSIMD_ENABLE_XTL_COMPLEX YES" ) + +get_property(XSIMD_SOURCE_DIR TARGET xsimd PROPERTY SOURCE_DIR) +set(XSIMD_INCLUDE_DIR ${XSIMD_SOURCE_DIR}/include) From c7db9bf3bc6276580e3c0a5715f0725d8d691293 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Fri, 21 Jun 2024 20:28:22 -0400 Subject: [PATCH 31/35] makefile prevent rebuilding spreadinterp{_32}.o every task --- makefile | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/makefile b/makefile index bd0c65c79..f9348f716 100644 --- a/makefile +++ b/makefile @@ -13,6 +13,7 @@ # Barnett tidying Feb, May 2020. Libin Lu edits, 2020. # Garrett Wright, Joakim Anden, Barnett: dual-prec lib build, Jun-Jul'20. # Windows compatibility, jonas-kr, Sep '20. +# XSIMD dependency, Marco Barbone, June 2024. # Compiler (CXX), and linking from C, fortran. We use GCC by default... CXX = g++ @@ -26,7 +27,7 @@ PYTHON = python3 # Notes: 1) -Ofast breaks isfinite() & isnan(), so use -O3 which now is as fast # 2) -fcx-limited-range for fortran-speed complex arith in C++ # 3) we use simply-expanded (:=) makefile variables, otherwise confusing -CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range $(CFLAGS) +CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range -ffp-contract=fast $(CFLAGS) FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) # FFTW base name, and math linking... @@ -50,10 +51,10 @@ OFLAGS = # For experts only, location of MWrap executable (see docs/install.rst): MWRAP = mwrap -# depenency root +# dependency root (relative to top directory) DEPS_ROOT := deps -# xsimd repo url +# xsimd dependency repo URL XSIMD_URL := https://github.com/xtensor-stack/xsimd.git XSIMD_VERSION := 13.0.0 XSIMD_DIR := $(DEPS_ROOT)/xsimd @@ -68,7 +69,7 @@ FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) # Now come flags that should be added, whatever user overrode in make.inc. # -fPIC (position-indep code) needed to build dyn lib (.so) # Also, we force return (via :=) to the land of simply-expanded variables... -INCL = -Iinclude +INCL = -Iinclude -I$(XSIMD_DIR)/include CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++17 CFLAGS := $(CFLAGS) $(INCL) -fPIC # here /usr/include needed for fftw3.f "fortran header"... (JiriK: no longer) @@ -121,11 +122,11 @@ OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o # all lib dual-precision objs OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) -.PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs +.PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean default: usage -all: test perftest lib examples fortran matlab octave python setup +all: test perftest lib examples fortran matlab octave python usage: @echo "Makefile for FINUFFT library. Please specify your task:" @@ -142,7 +143,8 @@ usage: @echo " make spreadtestall - small set spreader-only tests for CI use" @echo " make objclean - remove all object files, preserving libs & MEX" @echo " make clean - also remove all lib, MEX, py, and demo executables" - @echo " make setup - download dependencies" + @echo " make setup - check (and possibly download) dependencies" + @echo " make setupclean - delete downloaded dependencies" @echo "For faster (multicore) making, append, for example, -j8" @echo "" @echo "Make options:" @@ -155,9 +157,9 @@ usage: HEADERS = $(wildcard include/*.h include/finufft/*.h) # implicit rules for objects (note -o ensures writes to correct dir) -%.o: %.cpp $(HEADERS) setup +%.o: %.cpp $(HEADERS) $(CXX) -c $(CXXFLAGS) $< -o $@ -%_32.o: %.cpp $(HEADERS) setup +%_32.o: %.cpp $(HEADERS) $(CXX) -DSINGLE -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ @@ -168,8 +170,9 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) %_32.o: %.f $(FC) -DSINGLE -c $(FFLAGS) $< -o $@ -# included auto-generated code dependency... -src/spreadinterp.o: src/ker_horner_allw_loop_constexpr.h src/ker_lowupsampfac_horner_allw_loop_constexpr.c +# included auto-generated code and xsimd header-lib dependency... +src/spreadinterp.o: src/ker_horner_allw_loop_constexpr.h src/ker_lowupsampfac_horner_allw_loop_constexpr.c $(XSIMD_DIR)/include/xsimd/xsimd.hpp +src/spreadinterp_32.o: src/ker_horner_allw_loop_constexpr.h src/ker_lowupsampfac_horner_allw_loop_constexpr.c $(XSIMD_DIR)/include/xsimd/xsimd.hpp # lib ----------------------------------------------------------------------- @@ -416,11 +419,12 @@ wheel: $(STATICLIB) $(DYNLIB) docker-wheel: docker run --rm -e package_name=finufft -v `pwd`:/io libinlu/manylinux2010_x86_64_fftw /io/python/ci/build-wheels.sh -# =============================== SETUP ==================================== + +# ================== SETUP OF EXTERNAL DEPENDENCIES =============== define clone_repo - @echo "Cloning repository $(1) at tag $(2) into directory $(3)" @if [ ! -d "$(3)" ]; then \ + echo "Cloning repository $(1) at tag $(2) into directory $(3)"; \ git clone --depth=1 --branch $(2) $(1) $(3); \ else \ cd $(3) && \ @@ -435,17 +439,16 @@ define clone_repo fi endef -setup: - @echo "Downloading dependencies..." - @echo "Downloading xsimd..." +$(XSIMD_DIR)/include/xsimd/xsimd.hpp: mkdir -p $(DEPS_ROOT) + @echo "Checking xsimd external dependency..." $(call clone_repo,$(XSIMD_URL),$(XSIMD_VERSION),$(XSIMD_DIR)) - @echo "xsimd downloaded in deps/xsimd" - CXXFLAGS += -I$(XSIMD_DIR)/include + @echo "xsimd installed in deps/xsimd" setupclean: rm -rf $(DEPS_ROOT) + # =============================== DOCUMENTATION ============================= docs: docs/*.docsrc docs/matlabhelp.doc docs/makecdocs.sh @@ -458,7 +461,7 @@ docs/matlabhelp.doc: docs/genmatlabhelp.sh matlab/*.sh matlab/*.docsrc matlab/*. # =============================== CLEAN UP ================================== -clean: objclean pyclean setupclean +clean: objclean pyclean ifneq ($(MINGW),ON) # non-Windows-WSL clean up... rm -f $(STATICLIB) $(DYNLIB) From e60b966660841504bd898d441316c2657427cbe9 Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Fri, 21 Jun 2024 20:39:59 -0400 Subject: [PATCH 32/35] CHANGELOG --- CHANGELOG | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG b/CHANGELOG index d89a2f1ac..b1d4b0696 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,13 @@ List of features / changes made / release notes, in reverse chronological order. If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately). +V 2.3.0beta (6/21/24) + +* Major acceleration of spread/interp kernels using XSIMD header-only lib, + kernel evaluation, templating by ns with AVX-width-dependent decisions. + Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu). + NOTE: introduces new dependency (XSIMD), added to cMake and makefile. +* new perftest/compare_spreads.jl compares two spreadinterp libs (A Barnett). * new benchmarker perftest/spreadtestndall sweeps all kernel widths (M Barbone). * cufinufft now supports modeord(type 1,2 only): 0 CMCL-style increasing mode order, 1 FFT-style mode order. From 82d38397ca8ec7c168bc77a1013db298f6e697da Mon Sep 17 00:00:00 2001 From: ahbarnett Date: Fri, 21 Jun 2024 20:43:41 -0400 Subject: [PATCH 33/35] ackn.rst --- docs/ackn.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/ackn.rst b/docs/ackn.rst index 17fe097c3..82ec48440 100644 --- a/docs/ackn.rst +++ b/docs/ackn.rst @@ -18,15 +18,16 @@ Major code contributions by: * Libin Lu - guru Fortran, python, MATLAB/octave, julia interfaces, cmake, CI maintenance * Joakim Andén - python, MATLAB/FFTW issues, dual-precision, performance tests, GPU version merge docs/tests * Robert Blackwell - atomic OMP add_wrapped_subgrid, GPU version merge - +* Marco Barbone - SIMD kernel vectorization, benchmarking, foldrescale, Cmake, windows build + Other significant code contributions by: * Leslie Greengard and June-Yub Lee - CMCL Fortran test drivers * Dan Foreman-Mackey - early python wrappers * David Stein - python wrappers, finding "pi-1ULP" spreadcheck error * Garrett Wright - dual-precision build, py packaging, GPU version -* Wenda Zhou - Cmake build, SIMD optims, code review, professionalization -* Martin Reinecke - SIMD acceleration of interpolator, improved binsort +* Wenda Zhou - Cmake build, code review, professionalization, SIMD ideas +* Martin Reinecke - SIMD kernel and interp auto-vectorization, binsort; other good ideas Testing, bug reports, helpful discussions: @@ -45,11 +46,10 @@ Testing, bug reports, helpful discussions: * Vladimir Rokhlin - piecewise polynomial approximation on complex boxes * Reinhard Neder - fortran90 demo using finufft as module, OSX build * Vineet Bansal - py packaging -* Marco Barbone - cmake, windows build Logo design: `Sherry Choi `_ with input from Alex Barnett and Lucy Reading-Ikkanda. We are also indebted to the authors of other NUFFT codes -such as NFFT3, CMCL NUFFT, MIRT, BART, etc, upon whose interfaces, code, +such as NFFT3, CMCL NUFFT, MIRT, BART, DUCC0, etc, upon whose interfaces, code, and algorithms we have built. From f830ba3779d599e0400634ade8ce8529fafca0ca Mon Sep 17 00:00:00 2001 From: DiamonDinoia Date: Sat, 22 Jun 2024 19:30:05 -0400 Subject: [PATCH 34/35] removed foldrescalevec --- devel/foldrescale.cpp | 60 ------------------------------------------- 1 file changed, 60 deletions(-) diff --git a/devel/foldrescale.cpp b/devel/foldrescale.cpp index afcc4e312..d05ac986a 100644 --- a/devel/foldrescale.cpp +++ b/devel/foldrescale.cpp @@ -73,18 +73,6 @@ inline __attribute__((always_inline)) FLT foldRescale03(FLT x, BIGINT N) { return result * fN; } -#ifdef __AVX2__ - -inline __attribute__((always_inline)) __m256d foldRescaleVec(__m256d x, BIGINT N) { - __m256d result; - __m256d fN = _mm256_set1_pd(FLT(N)); - static const __m256d x2pi = _mm256_set1_pd(FLT(M_1_2PI)); - static const __m256d half = _mm256_set1_pd(FLT(0.5)); - result = _mm256_fmadd_pd(x, x2pi, half); - result = _mm256_sub_pd(result, _mm256_floor_pd(result)); - return _mm256_mul_pd(result, fN); -} -#endif static std::mt19937_64 gen; static std::uniform_real_distribution<> dis(-10, 10); @@ -197,21 +185,6 @@ static void BM_FoldRescale05N(benchmark::State &state) { } } -#ifdef __AVX2__ -static void BM_FoldRescaleVec(benchmark::State &state) { - for (auto _ : state) { - // Generate 4 floating point numbers - double x1 = dis(gen); - double x2 = dis(gen); - double x3 = dis(gen); - double x4 = dis(gen); - // Pack them into an AVX vector - __m256d x = _mm256_set_pd(x1, x2, x3, x4); - // Call the foldRescaleVec function - benchmark::DoNotOptimize(foldRescaleVec(x, N)); - } -} -#endif BENCHMARK(BM_BASELINE)->Iterations(10000000); BENCHMARK(BM_FoldRescaleMacro)->Iterations(1000000); @@ -221,9 +194,6 @@ BENCHMARK(BM_FoldRescale02)->Iterations(1000000); BENCHMARK(BM_FoldRescale03)->Iterations(10000000); BENCHMARK(BM_FoldRescale04)->Iterations(1000000); BENCHMARK(BM_FoldRescale05)->Iterations(1000000); -#ifdef __AVX2__ -BENCHMARK(BM_FoldRescaleVec)->Iterations(1000000 / 4); -#endif BENCHMARK(BM_FoldRescaleMacroN)->Iterations(1000000); BENCHMARK(BM_FoldRescale00N)->Iterations(1000000); BENCHMARK(BM_FoldRescale01N)->Iterations(1000000); @@ -232,33 +202,6 @@ BENCHMARK(BM_FoldRescale03N)->Iterations(1000000); BENCHMARK(BM_FoldRescale04N)->Iterations(1000000); BENCHMARK(BM_FoldRescale05N)->Iterations(1000000); -#ifdef __AVX2__ -void testFoldRescaleVec_avx256_vs_foldRescale00() { - // Generate 4 floating point numbers - double x1 = dis(gen); - double x2 = dis(gen); - double x3 = dis(gen); - double x4 = dis(gen); - - // Pack them into an AVX vector - __m256d xVec = _mm256_set_pd(x1, x2, x3, x4); - - // Call the foldRescaleVec function - __m256d resultVec = foldRescaleVec(xVec, N); - - // Extract the results from the AVX vector - - for (int i = 0; i < 4; ++i) { - double result00 = foldRescale03(xVec[i], N); - if (std::abs(1 - result00 / resultVec[i]) > 1e-14) { - std::cout << "input: " << xVec[i] << " result00: " << result00 - << " result256: " << resultVec[i] << std::endl; - throw std::runtime_error("foldRescaleVec is not equivalent to foldRescale00"); - } - } -} -#endif - void testFoldRescaleFunctions() { for (bool p : {true}) { for (int i = 0; i < 1024; ++i) { // Run the test 1000 times @@ -341,9 +284,6 @@ int main(int argc, char **argv) { std::cout << "Seed: " << seed << "\n"; gen.seed(seed); testFoldRescaleFunctions(); -#ifdef __AVX2__ - testFoldRescaleVec_avx256_vs_foldRescale00(); -#endif ::benchmark::Initialize(&argc, argv); BaselineSubtractingReporter reporter; ::benchmark::RunSpecifiedBenchmarks(&reporter); From 6da384576d3d642436d8735f5b9a3a5b2587e77b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Joakim=20And=C3=A9n?= Date: Mon, 24 Jun 2024 13:55:07 +0200 Subject: [PATCH 35/35] docs: fix CUDA version typo in GPU install docs --- docs/install_gpu.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/install_gpu.rst b/docs/install_gpu.rst index 794b29662..360543ac1 100644 --- a/docs/install_gpu.rst +++ b/docs/install_gpu.rst @@ -5,7 +5,7 @@ Installation (GPU) .. note:: - Python users may install the cuFINUFFT package using ``pip install cufinufft``, which contains binary wheels compiled against CUDA 10.2 on Linux. If these requirements do not work for your use case, please see the detailed instructions below. + Python users may install the cuFINUFFT package using ``pip install cufinufft``, which contains binary wheels compiled against CUDA 11.2 on Linux. If these requirements do not work for your use case, please see the detailed instructions below. The GPU version of FINUFFT is called cuFINUFFT, and it uses CUDA kernels (often exploiting fast GPU shared memory)