diff --git a/.github/workflows/C++.yml b/.github/workflows/C++.yml index 31d8b203b..f65200b6b 100644 --- a/.github/workflows/C++.yml +++ b/.github/workflows/C++.yml @@ -18,10 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install fftw - run: | - yum install -y fftw3-devel - - name: Compile C++ code run: | make spreadtestall @@ -36,9 +32,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install omp and fftw + - name: Install omp run: | - brew install libomp fftw + brew install libomp cp make.inc.macosx_clang make.inc - name: Compile C++ code @@ -55,9 +51,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install gcc and fftw + - name: Install gcc run: | - brew install gcc@10 fftw + brew install gcc@10 cp make.inc.macosx_gcc-10 make.inc - name: Compile C++ code @@ -86,7 +82,6 @@ jobs: diffutils pacboy: >- toolchain:p - fftw:p - name: Compile C++ code run: | cp make.inc.windows_msys make.inc diff --git a/.github/workflows/python_build_win.ps1 b/.github/workflows/python_build_win.ps1 index 072413d42..5b4444c4d 100644 --- a/.github/workflows/python_build_win.ps1 +++ b/.github/workflows/python_build_win.ps1 @@ -39,10 +39,6 @@ Copy-Item -Path C:\msys64\mingw64\bin\libstdc++-*.dll -Destination ([IO.Path]::C Copy-Item -Path C:\msys64\mingw64\bin\libgcc_s_seh-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libgomp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libwinpthread-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) New-Item -Path .\wheelhouse -ItemType Directory -Force wheel.exe pack $unpacked_wheel -d .\wheelhouse if (-not $?) 
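<# With the FFTW DLLs no longer bundled, the wheel only needs the toolchain
   runtime DLLs copied above. A minimal native smoke test for the now
   self-contained build could look like this (hypothetical smoke_test.cpp;
   finufft1d1 is the library's public type-1 interface, the driver values
   are assumptions):

     #include <complex>
     #include <vector>
     #include "finufft.h"

     int main() {
       const int64_t M = 1000, N = 256;           // nonuniform points, modes
       std::vector<double> x(M, 0.5);             // sample coordinates
       std::vector<std::complex<double>> c(M, {1.0, 0.0}), f(N);
       // type-1 NUFFT, +i sign, 1e-9 tolerance, default options
       return finufft1d1(M, x.data(), c.data(), +1, 1e-9, N, f.data(), nullptr);
     }
#>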
{throw "Failed pack wheel"} diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index fd0db91bb..102d33c4b 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -18,10 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install fftw - run: | - yum install -y fftw3-devel - - name: Install ffi run: | yum install -y libffi-devel @@ -44,9 +40,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install gcc and fftw + - name: Install gcc run: | - brew install gcc fftw + brew install gcc cp make.inc.macosx_gcc-8 make.inc echo "FC=gfortran-11" >> make.inc echo "CC=gcc-11" >> make.inc @@ -169,7 +165,7 @@ jobs: - uses: actions/checkout@v2 - name: Install GCC and make - run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw" + run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain" - name: Build and Test Python 3.8 uses: actions/setup-python@v2 diff --git a/.travis.yml b/.travis.yml index cd0187140..9a15984f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,6 @@ addons: homebrew: packages: - ccache - - fftw - libomp update: true cache: ccache diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e3ee23a7..a917d4f4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.19) project(finufft VERSION 2.2.0 LANGUAGES C CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) + set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers @@ -27,7 +30,7 @@ option(FINUFFT_BUILD_TESTS "Whether to build the FINUFFT tests" OFF) option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF) option(FINUFFT_ENABLE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." ON) -option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded. This does not affect the choice of FFTW library." ON) +option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded." ON) option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) # sphinx tag (don't remove): @cmake_opts_end @@ -35,9 +38,6 @@ option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinuff if(FINUFFT_USE_CPU) set(CPM_DOWNLOAD_VERSION 0.38.0) include(cmake/setupCPM.cmake) - - set(FFTW_VERSION 3.3.10) - include(cmake/setupFFTW.cmake) endif() if (FINUFFT_BUILD_MATLAB) @@ -87,7 +87,8 @@ endfunction() # Utility function to set finufft compilation options. 
function(set_finufft_options target) set_property(TARGET ${target} PROPERTY POSITION_INDEPENDENT_CODE ON) - set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 14) + set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 17) + set_property(TARGET ${target} PROPERTY CMAKE_CUDA_STANDARD 17) enable_asan(${target}) target_compile_options(${target} PRIVATE SHELL:$<$:${FINUFFT_ARCH_FLAGS}>) @@ -96,7 +97,7 @@ function(set_finufft_options target) target_compile_options(${target} PRIVATE $<$:-fcx-limited-range>) endif () - target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") + target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/contrib") if (FINUFFT_USE_OPENMP) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) # there are issues on windows with OpenMP and CMake, so we need to manually add the flags @@ -111,17 +112,6 @@ function(set_finufft_options target) endif () endif () - # FFTW CMAKE file includes the APIs only as an install target, so we need to manually - # include them since we need them for build not for install - # trying to include them directly into the fftw and fftwf targets causes issues with - # the latest version of cmake, so we do it here instead. - if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - list (GET FINUFFT_FFTW_LIBRARIES 0 element) - get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR) - set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api) - target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR}) - endif() - endfunction() if(FINUFFT_USE_CPU) @@ -129,13 +119,13 @@ if(FINUFFT_USE_CPU) add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32 PRIVATE SINGLE) set_finufft_options(finufft_f32) - target_link_libraries(finufft_f32 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) + target_link_libraries(finufft_f32 PUBLIC) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) - target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) + target_link_libraries(finufft_f64 PUBLIC) - add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) + add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp contrib/ducc0/infra/string_utils.cc contrib/ducc0/infra/threading.cc) set_finufft_options(finufft) target_link_libraries(finufft PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it @@ -144,7 +134,7 @@ if(FINUFFT_USE_CPU) endif() target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") - add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) + add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp contrib/ducc0/infra/string_utils.cc contrib/ducc0/infra/threading.cc) set_finufft_options(finufft) target_link_libraries(finufft_static PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it @@ -161,6 +151,7 @@ if(FINUFFT_USE_CUDA) set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) endif() enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) diff --git a/CMakePresets.json b/CMakePresets.json index 2363692b1..1cc13a3eb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -48,10 +48,9 @@ "name": 
"singlethreaded", "binaryDir": "build/singlethreaded", "displayName": "singlethreaded", - "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", + "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFT", "inherits": "default", "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "", "FINUFFT_USE_OPENMP": "OFF" } }, @@ -89,7 +88,6 @@ "description": "Build with the matlab interface", "generator": "Ninja Multi-Config", "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "Threads", "FINUFFT_BUILD_MATLAB": "ON", "FINUFFT_ENABLE_SANITIZERS": "OFF" } diff --git a/cmake/setupFFTW.cmake b/cmake/setupFFTW.cmake deleted file mode 100644 index 9a7f8c44d..000000000 --- a/cmake/setupFFTW.cmake +++ /dev/null @@ -1,62 +0,0 @@ -CPMAddPackage( - NAME findfftw - GIT_REPOSITORY "https://github.com/egpbos/findFFTW.git" - GIT_TAG "master" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES -) - -list(APPEND CMAKE_MODULE_PATH "${findfftw_SOURCE_DIR}") - -if (FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD) - find_package(FFTW) - if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - if (FINUFFT_FFTW_SUFFIX STREQUAL THREADS) - set(FINUFFT_USE_THREADS ON) - else() - set(FINUFFT_USE_THREADS OFF) - endif() - CPMAddPackage( - NAME fftw3 - OPTIONS - "ENABLE_AVX2 ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES - ) - - CPMAddPackage( - NAME fftw3f - OPTIONS - "ENABLE_AVX2 ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_FLOAT ON" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES - ) - - set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f) - if (FINUFFT_USE_THREADS) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads) - elseif (FINUFFT_USE_OPENMP) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp) - endif () - - foreach (element IN LISTS FINUFFT_FFTW_LIBRARIES) - set_property(TARGET ${element} PROPERTY POSITION_INDEPENDENT_CODE ON) - endforeach () - - else () - set(FINUFFT_FFTW_LIBRARIES "FFTW::Float" "FFTW::Double" "FFTW::Float${FINUFFT_FFTW_SUFFIX}" "FFTW::Double${FINUFFT_FFTW_SUFFIX}") - endif () -endif () \ No newline at end of file diff --git a/contrib/ducc0/fft/fft.h b/contrib/ducc0/fft/fft.h new file mode 100644 index 000000000..be270639c --- /dev/null +++ b/contrib/ducc0/fft/fft.h @@ -0,0 +1,982 @@ +/* +This file is part of the ducc FFT library + +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFT_H +#define DUCC0_FFT_H + +#include +#include +#include +#include +#include +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/mav.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/math/unity_roots.h" + +namespace ducc0 { + +namespace detail_fft { + +using namespace std; + +template using Troots = shared_ptr>>; +template inline auto tidx() { return type_index(typeid(T)); } + +template inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } +template inline void PMINPLACE(T &a, T &b) + { T t = a; a+=b; b=t-b; } +template inline void MPINPLACE(T &a, T &b) + { T t = a; a-=b; b=t+b; } +template void special_mul (const Cmplx &v1, const Cmplx &v2, Cmplx &res) + { + res = fwd ? 
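+  /* NOTE: special_mul<fwd> multiplies v1 by the twiddle v2 or by its
+     conjugate, so each kernel below gets compiled once per transform
+     direction with the sign baked in. */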
Cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : Cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } + +struct util1d // hack to avoid duplicate symbols + { + /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ + DUCC0_NOINLINE static size_t good_size_cmplx(size_t n) + { + if (n<=12) return n; + + size_t bestfac=2*n; + for (size_t f11=1; f11n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + /* returns the smallest composite of 2, 3, 5 which is >= n */ + DUCC0_NOINLINE static size_t good_size_real(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f5=1; f5n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + DUCC0_NOINLINE static vector prime_factors(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&1)==0) + { N>>=1; factors.push_back(2); } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + }; + +// T: "type", f/c: "float/complex", s/v: "scalar/vector" +template class cfftpass + { + public: + virtual ~cfftpass(){} + using Tcs = Cmplx; + + // number of Tcd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + factors.reserve(15); + while ((N&7)==0) + { factors.push_back(8); N>>=3; } + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +template class rfftpass + { + public: + virtual ~rfftpass(){} + + // number of Tfd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +template using Tcpass = shared_ptr>; +template using Trpass = shared_ptr>; + +template class pocketfft_c + { + 
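+/* Usage sketch for the helpers above (good_size_cmplx/good_size_real find the
+   next length that factors into 2,3,5,7,11 resp. 2,3,5; they are re-exported
+   at the end of this header as ducc0::good_size_complex/good_size_real):
+
+     size_t nc = ducc0::good_size_complex(577);  // 588 = 2*2*3*7*7
+     size_t nr = ducc0::good_size_real(577);     // 600 = 2*2*2*3*5*5
+*/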
private: + size_t N; + size_t critbuf; + Tcpass plan; + + public: + pocketfft_c(size_t n, bool vectorize=false) + : N(n), critbuf(((N&1023)==0) ? 16 : 0), + plan(cfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); } + template DUCC0_NOINLINE Cmplx *exec(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf, buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Cmplx *in, Tfs fct, bool fwd, size_t nthreads=1) const + { + aligned_array> buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class pocketfft_r + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_r(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + aligned_array buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class pocketfft_hartley + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_hartley(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, + in, buf, buf+N, true, nthreads)); + auto res2 = (res==buf) ? 
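+    /* Usage sketch for the plan classes above (assumes the caller's data is
+       layout-compatible with ducc0::Cmplx<double>):
+
+         ducc0::pocketfft_c<double> plan(1024);     // build once, reuse
+         std::vector<ducc0::Cmplx<double>> v(1024);
+         plan.exec(v.data(), 1.0, true);            // forward, unnormalized
+         plan.exec(v.data(), 1.0/1024, false);      // inverse, 1/N scaling
+    */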
in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, nthreads); + } + }; + +template class pocketfft_fht + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_fht(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, + in, buf, buf+N, true, nthreads)); + auto res2 = (res==buf) ? in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, nthreads); + } + }; + +// R2R transforms using FFTW's halfcomplex format +template class pocketfft_fftw + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_fftw(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = in; + auto res2 = buf; + if (!fwd) // go to FFTPACK halfcomplex order + { + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i(plan->exec(tifd, + res, res2, buf+N, fwd, nthreads)); + if (!fwd) return res; + + // go to FFTW halfcomplex order + res2 = (res==buf) ? 
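+    /* NOTE: FFTW's halfcomplex layout is r0,r1,...,r(n/2),i((n+1)/2-1),...,i1,
+       while the FFTPACK-style kernels underneath produce r0,r1,i1,r2,i2,...;
+       the index shuffles on either side of plan->exec translate between the
+       two orderings. */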
in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, fwd, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +// +// sine/cosine transforms +// + +template class T_dct1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dct1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length-1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=fftplan.length(), n=N/2+1; + if (ortho) + { c[0]*=sqrt2; c[n-1]*=sqrt2; } + auto tmp=&buf[0]; + tmp[0] = c[0]; + for (size_t i=1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, 1, true, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, ortho, 1, true, nthreads); + } + + size_t length() const { return fftplan.length()/2+1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dst1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dst1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length+1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + size_t N=fftplan.length(), n=N/2-1; + auto tmp = &buf[0]; + tmp[0] = tmp[n+1] = c[0]*0; + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, true, 1, false, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads) const + { + aligned_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, true, 1, false, nthreads); + } + + size_t length() const { return fftplan.length()/2-1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dcst23 + { + private: + pocketfft_r fftplan; + std::vector twiddle; + + public: + DUCC0_NOINLINE T_dcst23(size_t length, bool /*vectorize*/=false) + : fftplan(length), twiddle(length) + { + UnityRoots> tw(4*length); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + size_t NS2 = (N+1)/2; + if (type==2) + { + c[0] *= 2; + if ((N&1)==0) c[N-1]*=2; + if (cosine) + for (size_t k=1; k DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool ortho, int type, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, type, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec(c, &buf[0], fct, ortho, type, cosine, nthreads); + } + + size_t length() const { return fftplan.length(); } + size_t 
bufsize() const { return fftplan.bufsize(); } + }; + +template class T_dcst4 + { + private: + size_t N; + std::unique_ptr> fft; + std::unique_ptr> rfft; + aligned_array> C2; + size_t bufsz; + + public: + DUCC0_NOINLINE T_dcst4(size_t length, bool /*vectorize*/=false) + : N(length), + fft((N&1) ? nullptr : make_unique>(N/2)), + rfft((N&1)? make_unique>(N) : nullptr), + C2((N&1) ? 0 : N/2), + bufsz((N&1) ? (N+rfft->bufsize()) : (N+2*fft->bufsize())) + { + if ((N&1)==0) + { + UnityRoots> tw(16*N); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads) const + { + size_t n2 = N/2; + if (!cosine) + for (size_t k=0, kc=N-1; kexec(y, y+N, fct, true, nthreads); + { + auto SGN = [](size_t i) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + return (i&2) ? -sqrt2 : sqrt2; + }; + c[n2] = res[0]*SGN(n2+1); + size_t i=0, i1=1, k=1; + for (; k *>(buf); + for(size_t i=0; iexec(y2, y2+N/2, fct, true, nthreads); + for(size_t i=0, ic=n2-1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, true, 4, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec(c, &buf[0], fct, true, 4, cosine, nthreads); + } + + size_t length() const { return N; } + size_t bufsize() const { return bufsz; } + }; + +using shape_t=fmav_info::shape_t; +using stride_t=fmav_info::stride_t; + +constexpr bool FORWARD = true, + BACKWARD = false; + +/// Complex-to-complex Fast Fourier Transform +/** This executes a Fast Fourier Transform on \a in and stores the result in + * \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a minus sign will be used in the exponent. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void c2c(const cfmav> &in, + const vfmav> &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1); + +/// Fast Discrete Cosine Transform +/** This executes a DCT on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DCT is computed, otherwise an inverse DCT. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. 
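+ *
+ * Usage sketch (hypothetical driver; assumes the contiguous (pointer, shape)
+ * view constructor):
+ *
+ *   std::vector<double> v(128, 1.0);
+ *   ducc0::vfmav<double> a(v.data(), {128});
+ *   ducc0::dct(a, a, {0}, 2, 1.0, false, 1);  // in-place DCT-II along axis 0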
+ */ +template DUCC0_NOINLINE void dct(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1); + +/// Fast Discrete Sine Transform +/** This executes a DST on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DST is computed, otherwise an inverse DST. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void dst(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1); + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, const shape_t &axes, + bool forward, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, size_t axis, bool forward, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void c2r_mut(const vfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_fftpack(const cfmav &in, + const vfmav &out, const shape_t &axes, bool real2hermitian, bool forward, + T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_fftw(const cfmav &in, + const vfmav &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_separable_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_separable_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template void r2r_genuine_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template void r2r_genuine_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +/// Convolution and zero-padding/truncation along one axis +/** This performs a circular convolution with the kernel \a kernel on axis + * \a axis of \a in, applies the necessary zero-padding/truncation on this + * axis to give it the length \a out.shape(axis),and returns the result + * in \a out. + * + * The main purpose of this routine is efficiency: the combination of the above + * operations can be carried out more quickly than running the individual + * operations in succession. + * + * \a in and \a out must have identical shapes, with the possible exception + * of the axis \a axis; they may point to the same memory; in this case all + * of their strides must be identical. + * + * \a axis specifies the axis over which the operation is carried out. + * + * \a kernel must have the same length as \a in.shape(axis); it must be + * provided in the same domain as \a in (i.e. 
not pre-transformed). + * + * If \a in has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void convolve_axis(const cfmav &in, + const vfmav &out, size_t axis, const cmav &kernel, size_t nthreads=1); + +template DUCC0_NOINLINE void convolve_axis(const cfmav> &in, + const vfmav> &out, size_t axis, const cmav,1> &kernel, + size_t nthreads=1); +} + +using detail_fft::pocketfft_c; +using detail_fft::pocketfft_r; +using detail_fft::pocketfft_hartley; +using detail_fft::pocketfft_fht; +using detail_fft::pocketfft_fftw; + +using detail_fft::FORWARD; +using detail_fft::BACKWARD; +using detail_fft::c2c; +using detail_fft::c2r; +using detail_fft::c2r_mut; +using detail_fft::r2c; +using detail_fft::r2r_fftpack; +using detail_fft::r2r_fftw; +using detail_fft::r2r_separable_hartley; +using detail_fft::r2r_genuine_hartley; +using detail_fft::r2r_separable_fht; +using detail_fft::r2r_genuine_fht; +using detail_fft::dct; +using detail_fft::dst; +using detail_fft::convolve_axis; + +inline size_t good_size_complex(size_t n) + { return detail_fft::util1d::good_size_cmplx(n); } +inline size_t good_size_real(size_t n) + { return detail_fft::util1d::good_size_real(n); } + +} + +#endif diff --git a/contrib/ducc0/fft/fft1d_impl.h b/contrib/ducc0/fft/fft1d_impl.h new file mode 100644 index 000000000..f2bf69361 --- /dev/null +++ b/contrib/ducc0/fft/fft1d_impl.h @@ -0,0 +1,2990 @@ +/* +This file is part of the ducc FFT library + +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +For the odd-sized DCT-IV transforms: + Copyright (C) 2003, 2007-14 Matteo Frigo + Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
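+
+[Usage sketch for the public entry points declared in fft.h, assuming the
+ contiguous (pointer, shape) view constructors:
+
+   std::vector<std::complex<double>> buf(512, {1.0, 0.0});
+   ducc0::vfmav<std::complex<double>> arr(buf.data(), {512});
+   ducc0::c2c(arr, arr, {0}, ducc0::FORWARD, 1.0, 1);  // in-place, axis 0
+
+   std::vector<double> t(1000, 1.0);
+   std::vector<std::complex<double>> spec(501);        // n/2+1 coefficients
+   ducc0::cfmav<double> in(t.data(), {1000});
+   ducc0::vfmav<std::complex<double>> out(spec.data(), {501});
+   ducc0::r2c(in, out, 0, ducc0::FORWARD, 1.0, 1);     // half-spectrum r2c]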
+*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFT1D_IMPL_H +#define DUCC0_FFT1D_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/threading.h" +#include "ducc0/math/unity_roots.h" +#include "ducc0/fft/fft.h" + +namespace ducc0 { + +namespace detail_fft { + +using namespace std; + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +template constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template using fft1d_simd = typename simd_select>::type; +template constexpr inline bool fft1d_simd_exists = (fft1d_simdlen > 1); + +// Always use std:: for functions +template T cos(T) = delete; +template T sin(T) = delete; +template T sqrt(T) = delete; + +template void ROTX90(Cmplx &a) + { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads=1) const \ + { \ + static const auto tics = tidx(); \ + if (ti==tics) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
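+      /* NOTE: this dispatch tries the scalar complex type first, then each */ \
+      /* available SIMD width (native, /2, /4, /8) against the runtime      */ \
+      /* type_index, so one pass object serves scalar and vector callers.   */ \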
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +template class cfftp1: public cfftpass + { + public: + cfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class cfftp2: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=2; + aligned_array wa; + + auto WA(size_t i) const + { return wa[i-1]; } + + template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, + Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); + } + } + return ch; + } + } + + public: + cfftp2(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp3: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=3; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r=-0.5, + tw1i= (fwd ? 
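+      /* NOTE: -0.5 and 0.8660254... are cos(2*pi/3) and sin(2*pi/3); the
+         compile-time sign on the imaginary part selects the transform
+         direction. The radix-5/7/11 passes below follow the same pattern. */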
-1: 1) * Tfs(0.8660254037844386467637231707529362L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP3(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2; \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + CH(idx,k,0)=t0+t1; +#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + } +#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp4: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=4; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k(t4); + PM(CH(k,0),CH(k,2),t2,t3); + PM(CH(k,1),CH(k,3),t1,t4); + } + } + else + { + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); + } + for (size_t i=1; i(t4); + CH(i,k,0) = t2+t3; + special_mul(t1+t4,WA(0,i),CH(i,k,1)); + special_mul(t2-t3,WA(1,i),CH(i,k,2)); + special_mul(t1-t4,WA(2,i),CH(i,k,3)); + } + } + } + return ch; + } + + public: + cfftp4(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp5: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=5; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), + tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), + tw2r= Tfs(-0.8090169943749474241022934171828191L), + tw2i= (fwd ? 
-1: 1) * Tfs(0.5877852522924731291687059546390728L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP5(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + CH(idx,k,0).r=t0.r+t1.r+t2.r; \ + CH(idx,k,0).i=t0.i+t1.i+t2.i; + +#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + } + +#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb,da,db; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp7: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=7; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), + tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), + tw2r= Tfs(-0.2225209339563144042889025644967948L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), + tw3r= Tfs(-0.9009688679024191262361023195074451L), + tw3i= (fwd ? 
-1 : 1) * Tfs(0.433883739117558120475768332848359L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP7(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ + PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ + PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ + PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; + +#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ + { \ + Tcd ca,cb; \ + ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ + ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ + cb.i=y1*t7.r y2*t6.r y3*t5.r; \ + cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp8: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=8; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template void ROTX45(T &a) const + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + if constexpr (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } + else + { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } + } + template void ROTX135(T &a) const + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + if constexpr (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } + else + { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } + } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + if (l1==1) + { + auto CC = [cc,this](size_t a, size_t b) -> Tcd& + { return cc[a+ido*b]; }; + { + Tcd a0, a1, a2, a3, a4, a5, a6, a7; + PM(a1,a5,CC(0,1),CC(0,5)); + PM(a3,a7,CC(0,3),CC(0,7)); + PMINPLACE(a1,a3); + ROTX90(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0),CC(0,4)); + PM(a2,a6,CC(0,2),CC(0,6)); + PM(CC(0,0),CC(0,4),a0+a2,a1); + PM(CC(0,2),CC(0,6),a0-a2,a3); + ROTX90(a6); + PM(CC(0,1),CC(0,5),a4+a6,a5); + PM(CC(0,3),CC(0,7),a4-a6,a7); + } + for (size_t i=1; i(a7); + PMINPLACE(a1,a3); + ROTX90(a3); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + PM(a0,a4,CC(i,0),CC(i,4)); + PM(a2,a6,CC(i,2),CC(i,6)); + PMINPLACE(a0,a2); + CC(i,0) = a0+a1; + special_mul(a0-a1,WA(3,i),CC(i,4)); + special_mul(a2+a3,WA(1,i),CC(i,2)); + special_mul(a2-a3,WA(5,i),CC(i,6)); + ROTX90(a6); + PMINPLACE(a4,a6); + special_mul(a4+a5,WA(0,i),CC(i,1)); + special_mul(a4-a5,WA(4,i),CC(i,5)); + special_mul(a6+a7,WA(2,i),CC(i,3)); + special_mul(a6-a7,WA(6,i),CC(i,7)); + } + return cc; + } + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + 
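+        // NOTE: ROTX90/ROTX45/ROTX135 apply the fixed radix-8 twiddles
+        // exp(-/+ i*pi/2), exp(-/+ i*pi/4) and exp(-/+ 3*i*pi/4) using only
+        // additions and a single 1/sqrt(2) scaling instead of a general
+        // complex multiply.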
PM(a0,a4,CC(0,k),CC(4,k)); + PM(a2,a6,CC(2,k),CC(6,k)); + PM(CH(k,0),CH(k,4),a0+a2,a1); + PM(CH(k,2),CH(k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(k,1),CH(k,5),a4+a6,a5); + PM(CH(k,3),CH(k,7),a4-a6,a7); + } + } + else + { + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); + } + for (size_t i=1; i(a7); + PMINPLACE(a1,a3); + ROTX90(a3); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + PM(a0,a4,CC(i,0,k),CC(i,4,k)); + PM(a2,a6,CC(i,2,k),CC(i,6,k)); + PMINPLACE(a0,a2); + CH(i,k,0) = a0+a1; + special_mul(a0-a1,WA(3,i),CH(i,k,4)); + special_mul(a2+a3,WA(1,i),CH(i,k,2)); + special_mul(a2-a3,WA(5,i),CH(i,k,6)); + ROTX90(a6); + PMINPLACE(a4,a6); + special_mul(a4+a5,WA(0,i),CH(i,k,1)); + special_mul(a4-a5,WA(4,i),CH(i,k,5)); + special_mul(a6+a7,WA(2,i),CH(i,k,3)); + special_mul(a6-a7,WA(6,i),CH(i,k,7)); + } + } + } + return ch; + } + + public: + cfftp8(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i1; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class cfftp11: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=11; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template [[gnu::hot]] Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), + tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), + tw2r= Tfs(0.4154150130018864255292741492296232L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), + tw3r= Tfs(-0.1423148382732851404437926686163697L), + tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), + tw4r= Tfs(-0.6548607339452850640569250724662936L), + tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L), + tw5r= Tfs(-0.9594929736144973898903680570663277L), + tw5i= (fwd ? 
-1 : 1) * Tfs(0.2817325568414296977114179153466169L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP11(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; + +#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ + { \ + Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ + cb; \ + cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ + cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftpg: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + size_t ip; + aligned_array wa; + aligned_array csarr; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + size_t ipph = (ip+1)/2; + size_t idl1 = ido*l1; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+l1*c)]; }; + auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iwal-=ip; + Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + Tcs xwal=fwd ? 
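+        /* NOTE: cfftpg is the generic pass for odd factors >= 5; it evaluates
+           the DFT directly from the precomputed roots in csarr at O(ip^2)
+           cost per transform, so large prime factors are better served by
+           the Bluestein pass (cfftpblue) below. */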
csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); + idij=(jc-1)*(ido-1)+i-1; + special_mul(x2,wa[idij],CX(i,k,jc)); + } + } + } + return cc; + } + + public: + cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) + { + MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class cfftpblue: public cfftpass + { + private: + using typename cfftpass::Tcs; + + const size_t l1, ido, ip; + const size_t ip2; + const Tcpass subplan; + aligned_array wa, bk, bkf; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, + Tcd * DUCC0_RESTRICT buf, size_t nthreads) const + { + static const auto ti=tidx(); + Tcd *akf = &buf[0]; + Tcd *akf2 = subplan->needs_copy() ? (&buf[ip2]) : akf; + Tcd *subbuf = akf2+ip2; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME: parallelize here? + for (size_t k=0; k(CC(i,m,k),bk[m],akf[m]); + auto zero = akf[0]*Tfs(0); + for (size_t m=ip; m(subplan->exec(ti,akf,akf2, + subbuf, true, nthreads)); + + /* do the convolution */ + res[0] = res[0].template special_mul(bkf[0]); + for (size_t m=1; m<(ip2+1)/2; ++m) + { + res[m] = res[m].template special_mul(bkf[m]); + res[ip2-m] = res[ip2-m].template special_mul(bkf[m]); + } + if ((ip2&1)==0) + res[ip2/2] = res[ip2/2].template special_mul(bkf[ip2/2]); + + /* inverse FFT */ + res = static_cast(subplan->exec(ti, res, + (res==akf) ? akf2 : akf, subbuf, false, nthreads)); + + /* multiply by b_k and write to output buffer */ + if (l1>1) + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CH(i,k,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + else + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CC(i,0,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + } + + return (l1>1) ? ch : cc; + } + + public: + cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots &roots, + bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), + subplan(cfftpass::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), + bk(ip), bkf(ip2/2+1) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jsize()/(2*ip))*2*ip==roots->size()) ? + roots : make_shared>(2*ip); + size_t rfct2 = roots2->size()/(2*ip); + for (size_t m=1; m=2*ip) coeff-=2*ip; + bk[m] = (*roots2)[coeff*rfct2]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
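+     Sketch of the math: the weights built above are the Bluestein chirp
+         b_k = exp(+/- i*pi*k^2/ip),  k = 0..ip-1,
+     with k^2 reduced mod 2*ip (hence the incremental "coeff" update); bkf
+     holds the length-ip2 FFT of the zero-padded chirp, pre-scaled by 1/ip2
+     so the inverse FFT after the pointwise product needs no further
+     normalisation.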
*/ + aligned_array tbkf(ip2), tbkf2(ip2); + Tfs xn2 = Tfs(1)/Tfs(ip2); + tbkf[0] = bk[0]*xn2; + for (size_t m=1; m buf(subplan->bufsize()); + static const auto tics=tidx(); + auto res = static_cast(subplan->exec(tics, tbkf.data(), + tbkf2.data(), buf.data(), true)); + for (size_t i=0; i1; + bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); + } + + virtual size_t bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class cfft_multipass: public cfftpass + { + private: + using typename cfftpass::Tcs; + static constexpr size_t bunchsize=8; + + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + size_t rfct; + Troots myroots; + +// FIXME split into sub-functions. This is too long! + template Cmplx *exec_(Cmplx *cc, Cmplx *ch, + Cmplx *buf, size_t nthreads) const + { + using Tc = Cmplx; + if ((l1==1) && (ido==1)) // no chance at vectorizing + { + static const auto tic=tidx(); + Tc *p1=cc, *p2=ch; + for(const auto &pass: passes) + { + auto res = static_cast(pass->exec(tic, p1, p2, buf, + fwd, nthreads)); + if (res==p2) swap (p1,p2); + } + return p1; + } + else + { + if constexpr(is_same::value && fft1d_simd_exists) // we can vectorize! + { + using Tfv = fft1d_simd; + using Tcv = Cmplx; + constexpr size_t vlen = Tfv::size(); + size_t nvtrans = (l1*ido + vlen-1)/vlen; + // NOTE: removed "static" here, because it leads to trouble with gcc 7 + // static const type_index ticv = tidx(); + const type_index ticv = tidx(); + + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tc& + { return ch[b+l1*c]; }; + auto CC = [cc,this](size_t b, size_t c) -> Tc& + { return cc[b+ip*c]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + aligned_array tbuf(2*ip+32+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip+16]; + auto buf2 = &tbuf[2*ip+32]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m Tc& + { return cc[a+ido*b]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + aligned_array tbuf(2*ip+32+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip+16]; + auto buf2 = &tbuf[2*ip+32]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= ido) break; + if (i==0) + CC(0,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*m*i]); + } + } + } + }); + return cc; + } + +MR_fail("must not get here"); +#if 0 +//FIXME this code path is currently unused + aligned_array tbuf(2*ip+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip]; + auto buf2 = &tbuf[2*ip]; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME parallelize? 
+ for (size_t itrans=0; itrans ix, kx; + size_t ixcur = (itrans*vlen)%ido; + size_t kxcur = (itrans*vlen)/ido; + for (size_t n=0; n(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= l1*ido) break; + if (i==0) + CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*l1*m*i]); + } + } + } + return ch; +#endif + } + else + { + static const auto tic = tidx *>(); + if (ido==1) + { +// parallelize here! + for (size_t n=0; n *p1=&cc[n*ip], *p2=ch; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf, fwd)); + if (res==p2) swap (p1,p2); + } + if (res != &cc[n*ip]) + copy(res, res+ip, cc+n*ip); + } + // transpose + size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; +// parallelize here! + for (size_t ibunch=0; ibunch Tc& + { return cc[a+ido*b]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*m*i]); + } + } + } + for (size_t m=0; m Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch ix, kx; + size_t ixcur = (ibunch*bunchsize)%ido; + size_t kxcur = (ibunch*bunchsize)/ido; + for (size_t n=0; n *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*l1*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*l1*m*i]); + } + } + } + for (size_t m=0; m &roots, bool /*vectorize*/=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + myroots(roots) + { + size_t N=ip*l1*ido; + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + + // FIXME TBD +// do we need the vectorize flag at all? + size_t lim = 10000; //vectorize ? 10000 : 10000; + if (ip<=lim) + { + auto factors = cfftpass::factorize(ip); + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(cfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); + l1l*=fct; + } + } + else + { + vector packets(2,1); + auto factors = util1d::prime_factors(ip); + sort(factors.begin(), factors.end(), std::greater()); + for (auto fct: factors) + (packets[0]>packets[1]) ? 
packets[1]*=fct : packets[0]*=fct;
+      size_t l1l=1;
+      for (auto pkt: packets)
+        {
+        passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false));
+        l1l*=pkt;
+        }
+      }
+    for (const auto &pass: passes)
+      {
+      bufsz = max(bufsz, pass->bufsize());
+      need_cpy |= pass->needs_copy();
+      }
+    if ((l1!=1)||(ido!=1))
+      {
+      need_cpy=true;
+      bufsz += (bunchsize+1)*ip;
+      }
+    }
+
+  virtual size_t bufsize() const { return bufsz; }
+  virtual bool needs_copy() const { return need_cpy; }
+
+  POCKETFFT_EXEC_DISPATCH
+  };
+
+#undef POCKETFFT_EXEC_DISPATCH
+
+template <typename Tfs, size_t vlen> class cfftp_vecpass: public cfftpass<Tfs>
+  {
+  private:
+    static_assert(simd_exists<Tfs,vlen>, "bad vlen");
+    using typename cfftpass<Tfs>::Tcs;
+    using Tfv=typename simd_select<Tfs,vlen>::type;
+    using Tcv=Cmplx<Tfv>;
+
+    size_t ip;
+    Tcpass<Tfs> spass;
+    Tcpass<Tfs> vpass;
+    size_t bufsz;
+
+    template<bool fwd> Tcs *exec_ (Tcs *cc,
+      Tcs * /*ch*/, Tcs *sbuf, size_t nthreads) const
+      {
+      char *xbuf = reinterpret_cast<char *>(sbuf);
+      size_t misalign = reinterpret_cast<uintptr_t>(xbuf)&(sizeof(Tfv)-1);
+      if (misalign != 0)
+        xbuf += sizeof(Tfv)-misalign;
+      Tcv *buf = reinterpret_cast<Tcv *>(xbuf);
+      auto * cc2 = buf;
+      auto * ch2 = buf+ip/vlen+7;
+      auto * buf2 = buf+2*ip/vlen+7+7;
+      static const auto tics = tidx<Tcs *>();
+// run scalar pass
+      auto res = static_cast<Tcs *>(spass->exec(tics, cc,
+        reinterpret_cast<Tcs *>(ch2), reinterpret_cast<Tcs *>(buf2),
+        fwd, nthreads));
+// arrange input in SIMD-friendly way, must be done out-of-place
+      for (size_t i=0; i<ip/vlen; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          {
+          cc2[i].r[j] = res[i+j*(ip/vlen)].r;
+          cc2[i].i[j] = res[i+j*(ip/vlen)].i;
+          }
+      static const auto ticv = tidx<Tcv *>();
+      auto res2 = static_cast<Tcv *>(vpass->exec(ticv,
+        cc2, ch2, buf2, fwd, nthreads));
+// de-SIMDify, can be done pseudo-inplace
+      for (size_t i=0; i<ip/vlen; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          cc[i*vlen+j] = Tcs(res2[i].r[j], res2[i].i[j]);
+      return cc;
+      }
+
+  public:
+    cfftp_vecpass(size_t ip_, const Troots<Tfs> &roots)
+      : ip(ip_), spass(cfftpass<Tfs>::make_pass(1, ip/vlen, vlen, roots)),
+        vpass(cfftpass<Tfs>::make_pass(1, 1, ip/vlen, roots)), bufsz(0)
+      {
+      MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size");
+      bufsz = 2*(ip/vlen)+7+7;
+      bufsz += max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen);  // buffers for subpasses
+      bufsz *= vlen;  // since we specify in terms of Tcs
+      bufsz += vlen;  // wiggle room for alignment shifts
+      }
+    virtual size_t bufsize() const { return bufsz; }
+    virtual bool needs_copy() const { return false; }
+    virtual void *exec(const type_index &ti, void *in, void *copy, void *buf,
+      bool fwd, size_t nthreads=1) const
+      {
+      static const auto tics = tidx<Tcs *>();
+      MR_assert(ti==tics, "bad input type");
+      auto in1 = static_cast<Tcs *>(in);
+      auto copy1 = static_cast<Tcs *>(copy);
+      auto buf1 = static_cast<Tcs *>(buf);
+      return fwd ? exec_<true>(in1, copy1, buf1, nthreads)
+                 : exec_<false>(in1, copy1, buf1, nthreads);
+      }
+  };
+
+template<typename Tfs> Tcpass<Tfs> cfftpass<Tfs>::make_pass(size_t l1,
+  size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize)
+  {
+  MR_assert(ip>=1, "no zero-sized FFTs");
+  // do we have a 1D vectorizable FFT?
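+  // Rough map of the dispatch below: a single long transform (l1==1,
+  // ido==1) whose length is a multiple of the SIMD width is handled by
+  // cfftp_vecpass; otherwise, lengths that factorize to a single radix go to
+  // the hard-coded passes (2,3,4,5,7,8,11), other single-factor sizes below
+  // 110 to the generic cfftpg, larger ones to Bluestein's algorithm
+  // (cfftpblue), and composite lengths to a cfft_multipass chain.
+  // Illustration (hypothetical call): cfftpass<double>::make_pass(1, 1, 60, roots)
+  // would end up as a multipass over the factors of 60, e.g. 4*3*5.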
+ if (vectorize && (ip>300)&& (ip<=100000) && (l1==1) && (ido==1)) + { +// constexpr auto vlen = native_simd::size(); +// if constexpr(vlen>=4) + constexpr auto vlen = 4; + if constexpr(simd_exists) + if ((ip&(vlen-1))==0) + return make_shared>(ip, roots); + } + + if (ip==1) return make_shared>(); + auto factors=cfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + case 7: + return make_shared>(l1, ido, roots); + case 8: + return make_shared>(l1, ido, roots); + case 11: + return make_shared>(l1, ido, roots); + default: + if (ip<110) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads) const \ + { \ + static const auto tifs=tidx(); \ + if (ti==tifs) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +/* (a+ib) = conj(c+id) * (e+if) */ +template inline void MULPM + (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) + { a=c*e+d*f; b=c*f-d*e; } + +template class rfftp1: public rfftpass + { + public: + rfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class rfftp2: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=2; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp3: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=3; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs taur=Tfs(-0.5), + taui=Tfs(0.8660254037844386467637231707529362L); + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp4: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=4; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp5: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t 
ip=5; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), + ti11= Tfs(0.9510565162951535721164393333793821L), + tr12= Tfs(-0.8090169943749474241022934171828191L), + ti12= Tfs(0.5877852522924731291687059546390728L); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpg: public rfftpass + { + private: + size_t l1, ido; + size_t ip; + aligned_array wa, csarr; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + size_t ipph=(ip+1)/2; + size_t idl1 = ido*l1; + + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + if (ido>1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ik const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; ik &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + 
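+      // 'roots' is a shared table of complex roots of unity whose length is
+      // an integer multiple of N, so every order-N root this pass needs can
+      // be read at index rfct*k, with rfct the oversampling factor computed
+      // next. Worked example (made-up numbers): N=15 with roots->size()==30
+      // gives rfct==2, so the k-th root sits at table index 2*k.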
size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpblue: public rfftpass + { + private: + const size_t l1, ido, ip; + aligned_array wa; + const Tcpass cplan; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ + (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, + Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const + { + using Tcd = Cmplx; + auto buf = reinterpret_cast(buf_); + Tcd *cc2 = &buf[0]; + Tcd *ch2 = &buf[ip]; + Tcd *subbuf = &buf[2*ip]; + static const auto ticd = tidx(); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + // copy out + CH(0,0,k) = res[0].r; + for (size_t m=1; m<=ip/2; ++m) + { + CH(ido-1,2*m-1,k)=res[m].r; + CH(0,2*m,k)=res[m].i; + } + } + if (ido==1) return ch; + size_t ipph = (ip+1)/2; + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,0,k) = res[0].r; + CH(i,0,k) = res[0].i; + for (size_t m=1; m Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + for (size_t m=0; m(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,k,0) = res[0].r; + CH(i,k,0) = res[0].i; + for (size_t m=1; m &roots, bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), + cplan(cfftpass::make_pass(1,1,ip,roots,vectorize)) + { + MR_assert(ip&1, "Bluestein length must be odd"); + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jbufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfft_multipass: public rfftpass + { + private: + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[(i-1)*(ip-1)+x]; } + + template Tfd *exec_(Tfd *cc, Tfd *ch, Tfd *buf, + size_t nthreads) const + { + static const auto tifd = tidx(); + if ((l1==1) && (ido==1)) + { + Tfd *p1=cc, *p2=ch; + if constexpr (fwd) + for (auto it=passes.rbegin(); it!=passes.rend(); ++it) + { + auto res = static_cast((*it)->exec(tifd, + p1, p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + else + for (const auto &pass: passes) + { + auto res = static_cast(pass->exec(tifd, + p1, p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + return p1; + } + else + MR_fail("not yet supported"); + } + + public: + rfft_multipass(size_t l1_, size_t ido_, size_t ip_, + const Troots &roots, bool /*vectorize*/=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j::factorize(ip); + + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(rfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots)); + l1l*=fct; + } + for (const auto &pass: passes) + { + bufsz = max(bufsz, pass->bufsize()); + need_cpy |= pass->needs_copy(); + } + if ((l1!=1)||(ido!=1)) + { + need_cpy=true; + bufsz += 2*ip; + } + } + + virtual size_t 
bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfftp_complexify: public rfftpass + { + private: + size_t N; + Troots roots; + size_t rfct; + Tcpass pass; + size_t l1, ido; + static constexpr size_t ip=2; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * buf, size_t nthreads) const + { + using Tcd = Cmplx; + auto ccc = reinterpret_cast(cc); + auto cch = reinterpret_cast(ch); + auto cbuf = reinterpret_cast(buf); + static const auto ticd = tidx(); + if constexpr(fwd) + { + auto res = static_cast(pass->exec(ticd, + ccc, cch, cbuf, true, nthreads)); + auto rres = (res==ccc) ? ch : cc; + rres[0] = res[0].r+res[0].i; +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + auto xe = res[i]+res[xi].conj(); + auto xo = Tcd(res[i].i+res[xi].i, res[xi].r-res[i].r) + * (*roots)[rfct*i].conj(); + rres[2*i-1] = Tfs(0.5)*(xe.r+xo.r); + rres[2*i] = Tfs(0.5)*(xe.i+xo.i); + rres[2*xi-1] = Tfs(0.5)*(xe.r-xo.r); + rres[2*xi] = Tfs(0.5)*(xo.i-xe.i); + } + rres[N-1] = res[0].r-res[0].i; + return rres; + } + else + { + cch[0] = Tcd(cc[0]+cc[N-1], cc[0]-cc[N-1]); +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + Tcd t1 (cc[2*i-1], cc[2*i]); + Tcd t2 (cc[2*xi-1], -cc[2*xi]); + auto xe = t1+t2; + auto xo = (t1-t2)*(*roots)[rfct*i]; + cch[i] = (xe + Tcd(-xo.i, xo.r)); + cch[xi] = (xe.conj() + Tcd(xo.i, xo.r)); + } + auto res = static_cast(pass->exec(ticd, + cch, ccc, cbuf, false, nthreads)); + return (res==ccc) ? cc : ch; + } + } + + public: + rfftp_complexify(size_t N_, const Troots &roots_, bool vectorize=false) + : N(N_), roots(roots_), pass(cfftpass::make_pass(N/2, vectorize)) + { + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + MR_assert((N&1)==0, "N must be even"); + } + + virtual size_t bufsize() const { return 2*pass->bufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; +#undef POCKETFFT_EXEC_DISPATCH + +template Trpass rfftpass::make_pass(size_t l1, + size_t ido, size_t ip, const Troots &roots, bool vectorize) + { + MR_assert(ip>=1, "no zero-sized FFTs"); + if (ip==1) return make_shared>(); + if ((ip>1000) && ((ip&1)==0)) // use complex transform + { + bool doit = vectorize&&((ip&7)==0); // vecpass might be beneficial + doit |= ip>10000; // complex multipass might be beneficial + if (!doit) + { + auto factors = rfftpass::factorize(ip); + for (auto factor: factors) + // complex Bluestein or larger prime factor functions might be beneficial + if (factor>5) { doit=true; break; } + } + if (doit) + return make_shared>(ip, roots, vectorize); + } + auto factors=rfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + default: + if (ip<135) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +}} + +#endif diff --git a/contrib/ducc0/fft/fftnd_impl.h b/contrib/ducc0/fft/fftnd_impl.h new file mode 100644 index 000000000..1ab98027d --- /dev/null +++ b/contrib/ducc0/fft/fftnd_impl.h @@ -0,0 +1,1828 @@ +/* +This file is part of the ducc FFT library. 
+ +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFTND_IMPL_H +#define DUCC0_FFTND_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/threading.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/mav.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/math/unity_roots.h" +#include "ducc0/fft/fft1d_impl.h" + +/** \file fft.h + * Implementation of multi-dimensional Fast Fourier and related transforms + * \copyright Copyright (C) 2010-2021 Max-Planck-Society + * \copyright Copyright (C) 2019 Peter Bell + * \copyright + * \copyright For the odd-sized DCT-IV transforms: + * \copyright Copyright (C) 2003, 2007-14 Matteo Frigo + * \copyright Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + * + * \authors Martin Reinecke, Peter Bell + */ + +namespace ducc0 { + +namespace detail_fft { + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +template constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template using fft_simd = typename simd_select>::type; +template constexpr inline bool fft_simd_exists = (fft_simdlen > 1); + +struct util // hack to avoid duplicate symbols + { + static void sanity_check_axes(size_t ndim, const shape_t &axes) + { + if (ndim==1) + { + if ((axes.size()!=1) || (axes[0]!=0)) + throw std::invalid_argument("bad axes"); + return; + } + shape_t tmp(ndim,0); + if (axes.empty()) throw std::invalid_argument("no axes specified"); + for (auto ax : axes) + { + if (ax>=ndim) throw std::invalid_argument("bad axis number"); + if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly"); + } + } + + DUCC0_NOINLINE static void sanity_check_onetype(const fmav_info &a1, + const fmav_info &a2, bool inplace, const shape_t &axes) + { + sanity_check_axes(a1.ndim(), axes); + MR_assert(a1.conformable(a2), "array sizes are not conformable"); + if (inplace) MR_assert(a1.stride()==a2.stride(), "stride mismatch"); + } + DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac, + const fmav_info &ar, const shape_t &axes) + { + sanity_check_axes(ac.ndim(), axes); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i=ac.ndim()) throw std::invalid_argument("bad axis number"); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i std::shared_ptr get_plan(size_t length, bool vectorize=false) + { +#ifdef DUCC0_NO_FFT_CACHE + return std::make_shared(length, vectorize); +#else + constexpr size_t nmax=10; + struct entry { size_t n; bool vectorize; std::shared_ptr ptr; }; + static std::array cache{{{0,0,nullptr}}}; + static std::array last_access{{0}}; + static size_t access_counter = 0; + static Mutex mut; + + auto find_in_cache = [&]() -> std::shared_ptr + { + for (size_t i=0; i(length, vectorize); + { + LockGuard lock(mut); + + auto p = find_in_cache(); + if (p) return p; + + size_t lru = 0; + for (size_t i=1; i class multi_iter + { + private: + shape_t shp, pos; + stride_t str_i, 
str_o; + size_t cshp_i, cshp_o, rem; + ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N]; + bool uni_i, uni_o; + + void advance_i() + { + for (size_t i=0; i=1, "not enough dimensions"); + // Sort the extraneous dimensions in order of ascending output stride; + // this should improve overall cache re-use and avoid clashes between + // threads as much as possible. + shape_t idx(iarr.ndim()); + std::iota(idx.begin(), idx.end(), 0); + sort(idx.begin(), idx.end(), + [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);}); + for (auto i: idx) + if (i!=idim) + { + pos.push_back(0); + MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch"); + shp.push_back(iarr.shape(i)); + str_i.push_back(iarr.stride(i)); + str_o.push_back(oarr.stride(i)); + } + MR_assert(idim0) + { + sstr_i = str_i[0]; + sstr_o = str_o[0]; + } + + if (nshares==1) return; + if (nshares==0) throw std::runtime_error("can't run with zero threads"); + if (myshare>=nshares) throw std::runtime_error("impossible share requested"); + auto [lo, hi] = calcShare(nshares, myshare, rem); + size_t todo = hi-lo; + + size_t chunk = rem; + for (size_t i2=0, i=pos.size()-1; i2(stride_in() *tsz)&4095)==0) + || ((abs(stride_out()*tsz)&4095)==0); + } + bool critical_stride_other(size_t tsz) const + { + if (unistride_i()==0) return false; // it's just one transform + return ((abs(unistride_i()*tsz)&4095)==0) + || ((abs(unistride_o()*tsz)&4095)==0); + } + }; + +template class TmpStorage + { + private: + aligned_array d; + size_t dofs, dstride; + + public: + TmpStorage(size_t n_trafo, size_t bufsize_data, size_t bufsize_trafo, + size_t n_simultaneous, bool inplace) + { + if (inplace) + { + d.resize(bufsize_trafo); + return; + } + constexpr auto vlen = fft_simdlen; + // FIXME: when switching to C++20, use bit_floor(othersize) + size_t buffct = std::min(vlen, n_trafo); + size_t datafct = std::min(vlen, n_trafo); + if (n_trafo>=n_simultaneous*vlen) datafct = n_simultaneous*vlen; + dstride = bufsize_data; + dofs = bufsize_trafo; + // critical stride avoidance + if ((dstride&256)==0) dstride+=16; + if ((dofs&256)==0) dofs += 16; + d.resize(buffct*dofs + datafct*dstride); + } + + template T2 *transformBuf() + { return reinterpret_cast(d.data()); } + template T2 *dataBuf() + { return reinterpret_cast(d.data()) + dofs; } + size_t data_stride() const + { return dstride; } + }; + +template class TmpStorage2 + { + private: + TmpStorage &stg; + + public: + using datatype = T2; + TmpStorage2(TmpStorage &stg_): stg(stg_) {} + + T2 *transformBuf() { return stg.template transformBuf(); } + T2 *dataBuf() { return stg.template dataBuf(); } + size_t data_stride() const { return stg.data_stride(); } + }; + +template DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i tmp; + for (size_t j=0; j DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + const typename Tsimd::value_type * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T *DUCC0_RESTRICT dst) + { + const T * DUCC0_RESTRICT ptr = src.data(); + if (dst == &src.raw(it.iofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx *DUCC0_RESTRICT src, const vfmav> &dst) + { + constexpr auto vlen=Tsimd::size(); + Cmplx * 
DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i tmp(src[i]); + for (size_t j=0; j DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd *DUCC0_RESTRICT src, const vfmav &dst) + { + constexpr auto vlen=Tsimd::size(); + typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T *DUCC0_RESTRICT src, const vfmav &dst) + { + T * DUCC0_RESTRICT ptr=dst.data(); + if (src == &dst.raw(it.oofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + const typename Tsimd::value_type * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + const T * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx * DUCC0_RESTRICT src, const vfmav> &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i tmp(src[j0*vstr+i]); + for (size_t j1=0; j1 DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx * DUCC0_RESTRICT src, const vfmav> &dst, size_t nvec, size_t vstr) + { + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd * DUCC0_RESTRICT src, const vfmav &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T * DUCC0_RESTRICT src, const vfmav &dst, size_t nvec, size_t vstr) + { + T * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i struct add_vec + { using type = typename simd_select::type; }; +template struct add_vec, vlen> + { using type = Cmplx::type>; }; +template using add_vec_t = typename add_vec::type; + +template +DUCC0_NOINLINE void general_nd(const cfmav &in, const vfmav &out, + const shape_t &axes, T0 fct, size_t nthreads, const Exec &exec, + const bool /*allow_inplace*/=true) + { + if ((in.ndim()==1)&&(in.stride(0)==1)&&(out.stride(0)==1)) + { + auto plan = get_plan(in.shape(0), true); + exec.exec_simple(in.data(), out.data(), *plan, fct, nthreads); + return; + } + std::shared_ptr plan, vplan; + size_t nth1d = (in.ndim()==1) ? nthreads : 1; + + for (size_t iax=0; iaxlength())) + { + plan = get_plan(len, in.ndim()==1); + vplan = ((in.ndim()==1)||(len<300)||((len&3)!=0)) ? + plan : get_plan(len, true); + } + + execParallel(util::thread_count(nthreads, in, axes[iax], fft_simdlen), + [&](Scheduler &sched) + { + constexpr auto vlen = fft_simdlen; + constexpr size_t nmax = 16; + const auto &tin(iax==0? 
in : out); + multi_iter it(tin, out, axes[iax], sched.num_threads(), sched.thread_num()); + + // n_simul: vector size + // n_bunch: total size of bunch (multiple of n_simul) + size_t n_simul=1, n_bunch=1; + bool critstride = (((in.stride(axes[iax])*sizeof(T))&4095)==0) + || (((out.stride(axes[iax])*sizeof(T))&4095)==0); + bool nostride = (in.stride(axes[iax])==1) && (out.stride(axes[iax])==1); + + constexpr size_t l2cache=262144*2; + constexpr size_t cacheline=64; + + // working set size + auto wss = [&](size_t vl) { return sizeof(T)*(2*len*vl + plan->bufsize()); }; + // is the FFT small enough to fit into L2 vectorized? + if (wss(1)>l2cache) // "long" FFT, don't execute more than one at the same time + { + n_simul=1; + if (critstride) // make bunch large to reduce overall copy cost + { + n_bunch=n_simul; + while ((n_bunch storage(in.size()/len, len, max(plan->bufsize(),vplan->bufsize()), (n_bunch+vlen-1)/vlen, inplace); + + // first, do all possible steps of size n_bunch, then n_simul + if (n_bunch>1) + { +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + constexpr size_t lvlen = vlen; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } + if constexpr ((vlen>2) && (simd_exists)) + { + constexpr size_t lvlen = vlen/2; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } + if constexpr ((vlen>4) && (simd_exists)) + { + constexpr size_t lvlen = vlen/4; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } +#endif + { + TmpStorage2 storage2(storage); + while ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *vplan, fct, n_bunch, nth1d); + } + } + } + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, tin, out, storage2, *vplan, fct, nth1d, inplace); + } + } + }); // end of parallel region + fct = T0(1); // factor has been applied, use 1 for remaining axes + } + } + +struct ExecC2C + { + bool forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav> &in, + const vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, + size_t nthreads, bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same, T>::value) + if (inplace) + { + 
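+          // In-place fast path: reachable only when the working type equals
+          // the plan's complex scalar type. The data is transformed directly
+          // inside 'out', borrowing just the plan's small transform buffer
+          // instead of staging a full copy through the data buffer.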
if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav> &in, + const vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const Cmplx *in, Cmplx *out, const pocketfft_c &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct ExecHartley + { + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_hartley &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, nthreads); + } + }; + +struct ExecFHT + { + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_fht &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, nthreads); + } + }; + +struct ExecFFTW + { + bool 
forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_fftw &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct ExecDcst + { + bool ortho; + int type; + bool cosine; + + template + DUCC0_NOINLINE void operator() (const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, ortho, type, cosine, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const Tplan &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, ortho, type, cosine, nthreads); + } + }; + +template DUCC0_NOINLINE void general_r2c( + const cfmav &in, const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? 
nthreads : 1; + auto plan = std::make_unique>(in.shape(axis)); + size_t len=in.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j4) + if constexpr( simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + vout[it.oofs(0)].Set(res[0]); + size_t i=1, ii=1; + if (forward) + for (; i DUCC0_NOINLINE void general_c2r( + const cfmav> &in, const vfmav &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? 
nthreads : 1; + auto plan = std::make_unique>(out.shape(axis)); + size_t len=out.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(out.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + if constexpr (vlen>4) + if constexpr(simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } +#endif + { + TmpStorage2 storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + dbuf[0]=in.raw(it.iofs(0)).r; + { + size_t i=1, ii=1; + if (forward) + for (; iexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + }); // end of parallel region + } + +struct ExecR2R + { + bool r2c, forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, Tstorage &storage, + const pocketfft_r &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + T *buf1=storage.transformBuf(), *buf2=out.data()+it.oofs(0); + if (in.data()!=buf2) + copy_input(it, in, buf2); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_r &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + if ((!r2c) && forward) + for (size_t k=0; k DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_r &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void c2c(const cfmav> &in, + const vfmav> &out, const shape_t &axes, bool forward, + T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const auto &in2(reinterpret_cast >&>(in)); + const auto &out2(reinterpret_cast >&>(out)); + if ((axes.size()>1) && (in.data()!=out.data())) // optimize axis order + { + if ((in.stride(axes[0])!=1)&&(out.stride(axes[0])==1)) + { + shape_t axes2(axes); + swap(axes2[0],axes2.back()); + general_nd>(in2, out2, axes2, fct, nthreads, ExecC2C{forward}); + return; + } + for (size_t i=1; i>(in2, out2, axes2, fct, nthreads, ExecC2C{forward}); + return; + } + } + general_nd>(in2, out2, axes, fct, nthreads, ExecC2C{forward}); + } + +template DUCC0_NOINLINE void 
dct(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, true}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +template DUCC0_NOINLINE void dst(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, false}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + util::sanity_check_cr(out, in, axis); + if (in.size()==0) return; + const auto &out2(reinterpret_cast>&>(out)); + general_r2c(in, out2, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, const shape_t &axes, + bool forward, T fct, size_t nthreads) + { + util::sanity_check_cr(out, in, axes); + if (in.size()==0) return; + r2c(in, out, axes.back(), forward, fct, nthreads); + if (axes.size()==1) return; + + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(out, out, newaxes, forward, T(1), nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, size_t axis, bool forward, T fct, size_t nthreads) + { + util::sanity_check_cr(in, out, axis); + if (in.size()==0) return; + const auto &in2(reinterpret_cast>&>(in)); + general_c2r(in2, out, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads) + { + if (axes.size()==1) + return c2r(in, out, axes[0], forward, fct, nthreads); + util::sanity_check_cr(in, out, axes); + if (in.size()==0) return; + auto atmp(vfmav>::build_noncritical(in.shape(), UNINITIALIZED)); + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(in, atmp, newaxes, forward, T(1), nthreads); + c2r(atmp, out, axes.back(), forward, fct, nthreads); + } + +template DUCC0_NOINLINE void c2r_mut(const vfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads) + { + if (axes.size()==1) + return c2r(in, out, axes[0], forward, fct, nthreads); + util::sanity_check_cr(in, out, axes); + if (in.size()==0) return; + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(in, in, newaxes, forward, T(1), nthreads); + c2r(in, out, axes.back(), forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2r_fftpack(const cfmav &in, + const vfmav &out, const shape_t &axes, bool real2hermitian, bool forward, + T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); + } + +template DUCC0_NOINLINE void r2r_fftw(const cfmav &in, + const vfmav &out, const shape_t &axes, bool forward, + T fct, size_t nthreads) + { + 
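+  // FFTW-style "halfcomplex" r2r: for length n the forward result is laid
+  // out as r0,r1,...,r_{n/2},i_{(n+1)/2-1},...,i_1 (the ordering FFTW
+  // documents for its R2HC kind), in contrast to r2r_fftpack above, which
+  // interleaves real and imaginary parts.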
util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecFFTW{forward}); + } + +template DUCC0_NOINLINE void r2r_separable_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecHartley{}, false); + } + +template DUCC0_NOINLINE void r2r_separable_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecFHT{}, false); + } + +template void hermiteHelper(size_t idim, ptrdiff_t iin, + ptrdiff_t iout0, ptrdiff_t iout1, const cfmav &c, + const vfmav &r, const shape_t &axes, Func func, size_t nthreads) + { + auto cstr=c.stride(idim), str=r.stride(idim); + auto len=r.shape(idim); + + if (idim+1==c.ndim()) // last dimension, not much gain in parallelizing + { + if (idim==axes.back()) // halfcomplex axis + for (size_t i=0,ic=0; i void oscarize(const vfmav &data, size_t ax0, size_t ax1, + size_t nthreads) + { + auto nu=data.shape(ax0), nv=data.shape(ax1); + if ((nu<3)||(nv<3)) return; + vector slc(data.ndim()); + slc[ax0] = slice(1,(nu+1)/2); + slc[ax1] = slice(1,(nv+1)/2); + auto all = subarray(data, slc); + slc[ax0] = slice(nu-1,nu/2,-1); + auto ahl = subarray(data, slc); + slc[ax1] = slice(nv-1,nv/2,-1); + auto ahh = subarray(data, slc); + slc[ax0] = slice(1,(nu+1)/2); + auto alh = subarray(data, slc); + mav_apply([](T &ll, T &hl, T &hh, T &lh) + { + T tll=ll, thl=hl, tlh=lh, thh=hh; + T v = T(0.5)*(tll+tlh+thl+thh); + ll = v-thh; + hl = v-tlh; + lh = v-thl; + hh = v-tll; + }, nthreads, all, ahl, ahh, alh); + } + +template void r2r_genuine_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + if (axes.size()==1) + return r2r_separable_hartley(in, out, axes, fct, nthreads); + if (axes.size()==2) + { + r2r_separable_hartley(in, out, axes, fct, nthreads); + oscarize(out, axes[0], axes[1], nthreads); + return; + } + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + shape_t tshp(in.shape()); + tshp[axes.back()] = tshp[axes.back()]/2+1; + auto atmp(vfmav>::build_noncritical(tshp, UNINITIALIZED)); + r2c(in, atmp, axes, true, fct, nthreads); + hermiteHelper(0, 0, 0, 0, atmp, out, axes, [](const std::complex &c, T &r0, T &r1) + { + r0 = c.real()+c.imag(); + r1 = c.real()-c.imag(); + }, nthreads); + } + +template void r2r_genuine_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + if (axes.size()==1) + return r2r_separable_fht(in, out, axes, fct, nthreads); + if (axes.size()==2) + { + r2r_separable_fht(in, out, axes, fct, nthreads); + oscarize(out, axes[0], axes[1], nthreads); + return; + } + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + shape_t tshp(in.shape()); + tshp[axes.back()] = tshp[axes.back()]/2+1; + auto atmp(vfmav>::build_noncritical(tshp, UNINITIALIZED)); + r2c(in, atmp, axes, true, fct, nthreads); + hermiteHelper(0, 0, 0, 0, atmp, out, axes, [](const std::complex &c, T &r0, T &r1) + { + r0 = c.real()-c.imag(); + r1 = c.real()+c.imag(); + }, nthreads); + } + +template +DUCC0_NOINLINE void general_convolve_axis(const cfmav &in, const vfmav &out, + const size_t axis, const cmav 
&kernel, size_t nthreads, + const Exec &exec) + { + std::unique_ptr plan1, plan2; + + size_t l_in=in.shape(axis), l_out=out.shape(axis); + MR_assert(kernel.size()==l_in, "bad kernel size"); + plan1 = std::make_unique(l_in); + plan2 = std::make_unique(l_out); + size_t bufsz = max(plan1->bufsize(), plan2->bufsize()); + + vmav fkernel({kernel.shape(0)}, UNINITIALIZED); + for (size_t i=0; iexec(fkernel.data(), T0(1)/T0(l_in), true, nthreads); + + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/l_in, l_in+l_out, bufsz, 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=vlen) + { + it.advance(vlen); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/2); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + if constexpr (vlen>4) + if constexpr (simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/4); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } +#endif + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + }); // end of parallel region + } + +struct ExecConv1R + { + template void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_r &plan1, const pocketfft_r &plan2, + const cmav &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + plan1.exec_copyback(buf2, buf1, T0(1), true); + auto res = buf2; + { + res[0] *= fkernel(0); + size_t i; + for (i=1; 2*i t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + auto t3 = t1*t2; + res[2*i-1] = t3.r; + res[2*i] = t3.i; + } + if (2*i==l_min) + { + if (l_min t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + res[2*i-1] = (t1*t2).r*T0(2); + } + else + res[2*i-1] *= fkernel(2*i-1); + } + } + for (size_t i=l_in; i void operator() ( + const Titer &it, const cfmav> &in, const vfmav> &out, + Tstorage &storage, const pocketfft_c &plan1, const pocketfft_c &plan2, + const cmav,1> &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan1.exec(buf2, buf1, T0(1), true); + auto res2 = buf2+l_in; + { + res2[0] = res[0]*fkernel(0); + size_t i; + for (i=1; 2*i DUCC0_NOINLINE void convolve_axis(const cfmav &in, + const vfmav &out, size_t axis, const cmav &kernel, size_t nthreads) + { + MR_assert(axis, T>(in, out, axis, kernel, nthreads, + ExecConv1R()); + } +template DUCC0_NOINLINE void convolve_axis(const cfmav> &in, + const vfmav> &out, size_t axis, const cmav,1> &kernel, + size_t nthreads) + { + MR_assert(axis>&>(in)); + const auto &out2(reinterpret_cast>&>(out)); + const auto &kernel2(reinterpret_cast,1>&>(kernel)); + general_convolve_axis, T>(in2, out2, axis, kernel2, nthreads, + ExecConv1C()); + } + +} // namespace detail_fft 
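+
+// ---------------------------------------------------------------------------
+// Editorial note (not part of the upstream ducc0 sources): a minimal usage
+// sketch of the c2c interface declared above, assuming the usual
+// `using detail_fft::c2c;` re-export at this point in the header. The
+// function and variable names here are hypothetical.
+//
+//   #include <complex>
+//   #include "ducc0/fft/fft.h"
+//
+//   inline void forward_fft_2d(const std::complex<double> *in,
+//                              std::complex<double> *out,
+//                              size_t nx, size_t ny, size_t nthreads)
+//     {
+//     // cfmav/vfmav are non-owning views; passing only the shape makes the
+//     // strides default to a C-contiguous layout.
+//     ducc0::cfmav<std::complex<double>> min(in, {nx, ny});
+//     ducc0::vfmav<std::complex<double>> mout(out, {nx, ny});
+//     // transform along both axes; fct=1.0 applies no normalization
+//     ducc0::c2c(min, mout, {0, 1}, true, 1.0, nthreads);
+//     }
+// ---------------------------------------------------------------------------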
+ +} // namespace ducc0 + +#endif // POCKETFFT_HDRONLY_H diff --git a/contrib/ducc0/infra/aligned_array.h b/contrib/ducc0/infra/aligned_array.h new file mode 100644 index 000000000..f2fc9835e --- /dev/null +++ b/contrib/ducc0/infra/aligned_array.h @@ -0,0 +1,175 @@ +/** \file ducc0/infra/aligned_array.h + * + * \copyright Copyright (C) 2019-2022 Max-Planck-Society + * \author Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_ALIGNED_ARRAY_H +#define DUCC0_ALIGNED_ARRAY_H + +#include +#include + +namespace ducc0 { + +namespace detail_aligned_array { + +using namespace std; + +// std::aligned_alloc is a bit cursed ... it doesn't exist on MacOS < 10.15 +// and in musl. Let's unconditionally work around it for now. +//#if ((__cplusplus >= 201703L) && (!defined(__APPLE__))) +#define DUCC0_WORKAROUND_ALIGNED_ALLOC +//#endif + +/// Bare bones array class. +/** Mostly useful for uninitialized temporary buffers. + * \note Since this class operates on raw memory, it should only be used with + * POD types, and even then only with caution! 
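+ *
+ * (Editorial note: where aligned_alloc cannot be used, ralloc() below
+ * emulates it by over-allocating `alignment` extra bytes, advancing the
+ * returned pointer to an alignment boundary, and stashing the original
+ * malloc() pointer in the slot immediately below that address so that
+ * dealloc() can recover and free it.)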
*/ +template class array_base + { + private: + T *p; + size_t sz; + + static T *ralloc(size_t num) + { + if constexpr(alignment<=alignof(max_align_t)) + { + void *res = malloc(num*sizeof(T)); + if (!res) throw bad_alloc(); + return reinterpret_cast(res); + } + else + { + if (num==0) return nullptr; +#if (!defined(DUCC0_WORKAROUND_ALIGNED_ALLOC)) + // aligned_alloc requires the allocated size to be a multiple of the + // requested alignment, so increase size if necessary + void *res = aligned_alloc(alignment,((num*sizeof(T)+alignment-1)/alignment)*alignment); + if (!res) throw bad_alloc(); +#else // portable emulation + void *ptr = malloc(num*sizeof(T)+alignment); + if (!ptr) throw bad_alloc(); + void *res = reinterpret_cast((reinterpret_cast(ptr) & ~(size_t(alignment-1))) + alignment); + (reinterpret_cast(res))[-1] = ptr; +#endif + return reinterpret_cast(res); + } + } + static void dealloc(T *ptr) + { + if constexpr(alignment<=alignof(max_align_t)) + free(ptr); + else +#if (!defined(DUCC0_WORKAROUND_ALIGNED_ALLOC)) + free(ptr); +#else + if (ptr) free((reinterpret_cast(ptr))[-1]); +#endif + } + +#undef DUCC0_WORKAROUND_ALIGNED_ALLOC + + public: + /// Creates a zero-sized array with no associated memory. + array_base() : p(nullptr), sz(0) {} + /// Creates an array with \a n entries. + /** \note Memory is not initialized! */ + array_base(size_t n) : p(ralloc(n)), sz(n) {} + array_base(const array_base &) = delete; + array_base(array_base &&other) + : p(other.p), sz(other.sz) + { other.p=nullptr; other.sz=0; } + ~array_base() { dealloc(p); } + + array_base &operator=(const array_base &) = delete; + array_base &operator=(array_base &&other) + { + swap(p, other.p); + swap(sz, other.sz); + return *this; + } + + /// If \a n is different from the current size, resizes the array to hold + /// \a n elements. + /** \note No data content is copied, the new array is uninitialized! */ + void resize(size_t n) + { + if (n==sz) return; + dealloc(p); + p = ralloc(n); + sz = n; + } + + /// Returns a writeable reference to the element at index \a idx. + T &operator[](size_t idx) { return p[idx]; } + /// Returns a read-only reference to the element at index \a idx. + const T &operator[](size_t idx) const { return p[idx]; } + + /// Returns a writeable pointer to the array data. + T *data() { return p; } + /// Returns a read-only pointer to the array data. + const T *data() const { return p; } + + /// Returns the size of the array. + size_t size() const { return sz; } + }; + +template using quick_array = array_base; +template using aligned_array = array_base; + +} + +using detail_aligned_array::aligned_array; +using detail_aligned_array::quick_array; + +} + +#endif + diff --git a/contrib/ducc0/infra/error_handling.h b/contrib/ducc0/infra/error_handling.h new file mode 100644 index 000000000..bfd4ea17a --- /dev/null +++ b/contrib/ducc0/infra/error_handling.h @@ -0,0 +1,120 @@ +/** \file ducc0/infra/error_handling.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_ERROR_HANDLING_H +#define DUCC0_ERROR_HANDLING_H + +#include +#include +#include "ducc0/infra/useful_macros.h" + +namespace ducc0 { + +namespace detail_error_handling { + +#if defined (__GNUC__) +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__, __PRETTY_FUNCTION__) +#else +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__) +#endif + +// to be replaced with std::source_location once generally available +class CodeLocation + { + private: + const char *file, *func; + int line; + + public: + CodeLocation(const char *file_, int line_, const char *func_=nullptr) + : file(file_), func(func_), line(line_) {} + + inline ::std::ostream &print(::std::ostream &os) const + { + os << "\n" << file << ": " << line; + if (func) os << " (" << func << ")"; + os << ":\n"; + return os; + } + }; + +inline ::std::ostream &operator<<(::std::ostream &os, const CodeLocation &loc) + { return loc.print(os); } + +template +void streamDump__(::std::ostream &os, Args&&... args) + { (os << ... << args); } +template +[[noreturn]] DUCC0_NOINLINE void fail__(Args&&... args) + { + ::std::ostringstream msg; \ + ::ducc0::detail_error_handling::streamDump__(msg, std::forward(args)...); \ + throw ::std::runtime_error(msg.str()); \ + } + +/// Throws a std::runtime_error containing the code location and the +/// passed arguments. +#define MR_fail(...) 
\ + do { \ + ::ducc0::detail_error_handling::fail__(DUCC0_ERROR_HANDLING_LOC_, "\n", ##__VA_ARGS__, "\n"); \ + } while(0) + +/// If \a cond is false, throws a std::runtime_error containing the code +/// location and the passed arguments. +#define MR_assert(cond,...) \ + do { \ + if (cond); \ + else { MR_fail("Assertion failure\n", ##__VA_ARGS__); } \ + } while(0) + +}} + +#endif diff --git a/contrib/ducc0/infra/mav.h b/contrib/ducc0/infra/mav.h new file mode 100644 index 000000000..eef71ef5d --- /dev/null +++ b/contrib/ducc0/infra/mav.h @@ -0,0 +1,1354 @@ +/*! \file ducc0/infra/mav.h + * Classes for dealing with multidimensional arrays + * + * \copyright Copyright (C) 2019-2023 Max-Planck-Society + * \author Martin Reinecke + * */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_MAV_H +#define DUCC0_MAV_H + +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/threading.h" + +namespace ducc0 { + +namespace detail_mav { + +using namespace std; + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +struct uninitialized_dummy {}; +constexpr uninitialized_dummy UNINITIALIZED; + +template class cmembuf + { + protected: + shared_ptr> ptr; + shared_ptr> rawptr; + const T *d; + + cmembuf(const T *d_, const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(d_) {} + + // externally owned data pointer + cmembuf(const T *d_) + : d(d_) {} + // share another memory buffer, but read-only + cmembuf(const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(other.d) {} + cmembuf(size_t sz) + : ptr(make_shared>(sz)), d(ptr->data()) {} +#if 1 + cmembuf(size_t sz, uninitialized_dummy) + : rawptr(make_shared>(sz)), d(rawptr->data()) {} +# else // "poison" the array with a fixed value; use for debugging + cmembuf(size_t sz, uninitialized_dummy) + : rawptr(make_shared>(sz)), d(rawptr->data()) + { for (size_t i=0; i const T &raw(I i) const + { return d[i]; } + // read access to data area + const T *data() const + { return d; } + }; + +constexpr size_t MAXIDX=~(size_t(0)); + +struct slice + { + size_t beg, end; + ptrdiff_t step; + slice() : beg(0), end(MAXIDX), step(1) {} + slice(size_t idx) : beg(idx), end(idx), step(1) {} + slice(size_t beg_, size_t end_, ptrdiff_t step_=1) + : beg(beg_), end(end_), step(step_) + { +// FIXME: add sanity checks here + } + + size_t size(size_t shp) const + { + if (beg==end) return 0; + if (step>0) return (min(shp,end)-beg+step-1)/step; + // negative step + if (end==MAXIDX) + return (beg-step)/(-step); + return (beg-end-step-1)/(-step); + } + }; + +/// Helper class containing shape and stride information of an `fmav` object +class fmav_info + { + public: + /// vector of nonnegative integers for storing the array shape + using shape_t = vector; + /// vector of integers for storing the array strides + using stride_t = vector; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + auto ndim = shp.size(); + // MR using the static_cast just to avoid a GCC warning. +// stride_t res(ndim); + stride_t res(static_cast(ndim)); + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... ns) const + { return str[dim]*ptrdiff_t(n) + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*ptrdiff_t(n); } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs a 1D object with all extents and strides set to zero. + fmav_info() : shp(1,0), str(1,0), sz(0) {} + /// Constructs an object with the given shape and stride. 
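+  /// (Strides are given in units of elements, not bytes; negative strides
+  /// are legal and arise naturally from slices with negative step.)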
+ fmav_info(const shape_t &shape_, const stride_t &stride_) + : shp(shape_), str(stride_), + sz(accumulate(shp.begin(),shp.end(),size_t(1),multiplies<>())) + { + MR_assert(shp.size()==str.size(), "dimensions mismatch"); + } + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + fmav_info(const shape_t &shape_) + : fmav_info(shape_, shape2stride(shape_)) {} + void assign(const fmav_info &other) + { + shp = other.shp; + str = other.str; + sz = other.sz; + } + /// Returns the dimensionality of the object. + size_t ndim() const { return shp.size(); } + /// Returns the total number of entries in the object. + size_t size() const { return sz; } + /// Returns the shape of the object. + const shape_t &shape() const { return shp; } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return shp[i]; } + /// Returns the strides of the object. + const stride_t &stride() const { return str; } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return str[i]; } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return ((ndim()==0) || (str.back()==1)); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { + auto ndim = shp.size(); + ptrdiff_t stride=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const fmav_info &other) const + { return shp==other.shp; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + MR_assert(ndim()==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + ptrdiff_t idx(const shape_t &ns) const + { + MR_assert(ndim()==ns.size(), "incorrect number of indices"); + size_t res = 0; + for (size_t i=0; i ptrdiff_t idxval(RAiter beg, RAiter end) const + { + MR_assert(ndim()==size_t(end-beg), "incorrect number of indices"); + size_t res = 0; + for (size_t i=0; i=shp.size(), "cannot reduce dimensionality"); + stride_t newstr(shp2.size(), 0); + for (size_t i=0; i=ndim(), + "new shape smaller than original one"); + MR_assert(axpos.size()==ndim(), "bad axpos size"); + stride_t new_stride(new_shape.size(), 0); + vector used(new_shape.size(),0); + for (size_t i=0; i &slices) const + { + auto ndim = shp.size(); + shape_t nshp(ndim); + stride_t nstr(ndim); + MR_assert(slices.size()==ndim, "incorrect number of slices"); + size_t n0=0; + for (auto x:slices) if (x.beg==x.end) ++n0; + ptrdiff_t nofs=0; + nshp.resize(ndim-n0); + nstr.resize(ndim-n0); + for (size_t i=0, i2=0; i class mav_info + { + public: + /// Fixed-size array of nonnegative integers for storing the array shape + using shape_t = array; + /// Fixed-size array of integers for storing the array strides + using stride_t = array; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + stride_t res; + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... 
ns) const + { return str[dim]*n + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*n; } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs an object with all extents and strides set to zero. + mav_info() : sz(0) + { + for (size_t i=0; i())) {} + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + mav_info(const shape_t &shape_) + : mav_info(shape_, shape2stride(shape_)) {} + mav_info(const fmav_info &inp) + { + MR_assert(inp.ndim()==ndim, "dimensionality mismatch"); + sz=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const mav_info &other) const + { return shp==other.shp; } + /// Returns true iff this->shape and \a other match. + bool conformable(const shape_t &other) const + { return shp==other; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + static_assert(ndim==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + mav_info transpose() const + { + shape_t shp2; + stride_t str2; + for (size_t i=0; i prepend_1() const + { + typename mav_info::shape_t newshp; + typename mav_info::stride_t newstr; + newshp[0] = 1; + newstr[0] = 0; + for (size_t i=0; i(newshp, newstr); + } + + protected: + template auto subdata(const vector &slices) const + { + MR_assert(slices.size()==ndim, "bad number of slices"); + array nshp; + array nstr; + + // unnecessary, but gcc warns otherwise + for (size_t i=0; i(nshp, nstr), nofs); + } + }; + +template class cfmav: public fmav_info, public cmembuf + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + using fmav_info::idx; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + + + protected: + cfmav(const shape_t &shp_) + : tinfo(shp_), tbuf(size()) {} + cfmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : tinfo(shp_, str_), tbuf(size(), UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i const T &operator()(Ns... 
ns) const + { return raw(idx(ns...)); } + const T &operator()(const shape_t &ns) const + { return raw(idx(ns)); } + template const T& val(RAiter beg, RAiter end) const + { return raw(idxval(beg, end)); } + + cfmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = subdata(slices); + return cfmav(ninfo, tbuf::d+nofs, *this); + } + cfmav extend_and_broadcast(const shape_t &new_shape, const shape_t &axpos) const + { + return cfmav(fmav_info::extend_and_broadcast(new_shape, axpos), *this); + } + cfmav extend_and_broadcast(const shape_t &new_shape, size_t firstaxis) const + { + return cfmav(fmav_info::extend_and_broadcast(new_shape, firstaxis), *this); + } + cfmav transpose() const + { + return cfmav(static_cast(this)->transpose(), *static_cast(this)); + } + }; + +template cfmav subarray + (const cfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class vfmav: public cfmav + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + using tinfo::shp, tinfo::str; + using fmav_info::idx; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tinfo::size, tinfo::shape, tinfo::stride; + + protected: + vfmav(const fmav_info &info, const tbuf &buf) + : cfmav(info, buf) {} + vfmav(const fmav_info &info, T *d_, const tbuf &buf) + : cfmav(info, d_, buf) {} + + public: + using tbuf::raw, tbuf::data, tinfo::ndim; + vfmav() {} + vfmav(T *d_, const fmav_info &info) + : cfmav(d_, info) {} + vfmav(T *d_, const shape_t &shp_, const stride_t &str_) + : cfmav(d_, shp_, str_) {} + vfmav(T *d_, const shape_t &shp_) + : cfmav(d_, shp_) {} + vfmav(const shape_t &shp_) + : cfmav(shp_) {} + vfmav(const shape_t &shp_, uninitialized_dummy) + : cfmav(shp_, UNINITIALIZED) {} + vfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : cfmav(shp_, str_, UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i(buf, shp_, str_) {} + + T *data() const + { return const_cast(tbuf::d); } + template T &raw(I i) const + { return data()[i]; } + + // no-op. Needed for template tricks. + using cfmav::to_fmav; + vfmav to_fmav() const { return *this; } + + void assign(const vfmav &other) + { + fmav_info::assign(other); + cmembuf::assign(other); + } + + using cfmav::operator(); + template T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + T &operator()(const shape_t &ns) const + { return raw(idx(ns)); } + using cfmav::val; + template T& val(RAiter beg, RAiter end) const + { return raw(idxval(beg, end)); } + + vfmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::subdata(slices); + return vfmav(ninfo, data()+nofs, *this); + } + /** Returns a writable fmav with the specified shape. + * The strides are chosen in such a way that critical strides (multiples + * of 4096 bytes) along any dimension are avoided, by enlarging the + * allocated memory slightly if necessary. + * The array data is default-initialized. 
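+   * (A "critical" stride is one whose byte distance is a near-multiple of
+   * the 4096-byte granularity used by noncritical_shape() in misc_utils.h;
+   * such strides funnel many elements into the same cache/TLB sets and can
+   * cost a large factor in throughput.)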
*/ + static vfmav build_noncritical(const shape_t &shape) + { + auto ndim = shape.size(); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vfmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i slc(ndim); + for (size_t i=0; i(this)->transpose(), *static_cast(this)); + } + }; + +template vfmav subarray + (const vfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class cmav: public mav_info, public cmembuf + { + protected: + template friend class cmav; + template friend class vmav; + + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + cmav() {} + cmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cmav(const shape_t &shp_) + : tinfo(shp_), tbuf(size()) {} + cmav(const tbuf &buf, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(buf) {} + cmav(const tinfo &info, const T *d_, const tbuf &buf) + : tinfo(info), tbuf(d_, buf) {} + cmav(const tinfo &info, const tbuf &buf) + : tinfo(info), tbuf(buf) {} + + public: + cmav(const T *d_, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(d_) {} + cmav(const T *d_, const shape_t &shp_) + : tinfo(shp_), tbuf(d_) {} + cmav(const cfmav &inp) + : tinfo(inp), tbuf(inp) {} + void assign(const cmav &other) + { + mav_info::assign(other); + cmembuf::assign(other); + } + operator cfmav() const + { + return cfmav(*this, {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + // Needed for template tricks. + cfmav to_fmav() const { return operator cfmav(); } + + template const T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + template cmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return cmav (ninfo, tbuf::d+nofs, *this); + } + + static cmav build_uniform(const shape_t &shape, const T &value) + { + // Don't do this at home! 
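+    // (All-zero strides below make every index of the returned view alias
+    // the single element allocated here; the const_cast write is the only
+    // mutation that element ever sees.)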
+ shape_t tshp; + tshp.fill(1); + cmav tmp(tshp); + const_cast(tmp.raw(0)) = value; + stride_t nstr; + nstr.fill(0); + return cmav(tmp, shape, nstr); + } + cmav transpose() const + { + return cmav(static_cast(this)->transpose(), *static_cast(this)); + } + cmav prepend_1() const + { + return cmav(static_cast(this)->prepend_1(), *static_cast(this)); + } + template cmav reinterpret + (const typename cmav::shape_t &newshp, + const typename cmav::stride_t &newstr) const + { + return cmav(*static_cast(this), newshp, newstr); + } + }; +template cmav subarray + (const cmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +template class vmav: public cmav + { + protected: + template friend class vmav; + + using parent = cmav; + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + vmav(const tinfo &info, T *d_, const tbuf &buf) + : parent(info, d_, buf) {} + vmav(const tinfo &info, const tbuf &buf) + : parent(info, buf) {} + vmav(const tbuf &buf, const shape_t &shp_, const stride_t &str_) + : parent(buf, shp_, str_){} + + public: + vmav() {} + vmav(T *d_, const shape_t &shp_, const stride_t &str_) + : parent(d_, shp_, str_) {} + vmav(T *d_, const shape_t &shp_) + : parent(d_, shp_) {} + vmav(const shape_t &shp_) + : parent(shp_) {} + vmav(const shape_t &shp_, uninitialized_dummy) + : parent(shp_, UNINITIALIZED) {} + vmav(const vfmav &inp) + : parent(inp) {} + + void assign(vmav &other) + { parent::assign(other); } + void dealloc() + { + vmav empty; + assign(empty); + } + operator vfmav() const + { + return vfmav(*const_cast(static_cast(this)), {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + // Needed for template tricks. + using cmav::to_fmav; + vfmav to_fmav() const { return operator vfmav(); } + + using parent::operator(); + template T &operator()(Ns... 
ns) const + { return const_cast(parent::operator()(ns...)); } + + template vmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return vmav (ninfo, data()+nofs, *this); + } + + T *data() const + { return const_cast(tbuf::d); } + // read access to element #i + template T &raw(I i) const + { return data()[i]; } + + static vmav build_empty() + { + shape_t nshp; + nshp.fill(0); + return vmav(static_cast(nullptr), nshp); + } + + static vmav build_noncritical(const shape_t &shape) + { + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i(slc); + } + static vmav build_noncritical(const shape_t &shape, uninitialized_dummy) + { + if (ndim<=1) return vmav(shape, UNINITIALIZED); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2, UNINITIALIZED); + vector slc(ndim); + for (size_t i=0; i(slc); + } + vmav transpose() const + { + return vmav(static_cast(this)->transpose(), *static_cast(this)); + } + vmav prepend_1() const + { + return vmav(static_cast(this)->prepend_1(), *static_cast(this)); + } + template vmav reinterpret + (const typename vmav::shape_t &newshp, + const typename vmav::stride_t &newstr) const + { + return vmav(*static_cast(this), newshp, newstr); + } + }; + +template vmav subarray + (const vmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +// various operations involving fmav objects of the same shape -- experimental + +DUCC0_NOINLINE tuple, size_t, size_t> + multiprep(const vector &info, const vector &tsizes); +DUCC0_NOINLINE tuple> + multiprep(const vector &info); + +template constexpr inline size_t tuplelike_size() + { return tuple_size_v>; } + +template +inline void call_with_tuple_impl(Func &&func, const Ttuple& tuple, + index_sequence) + { func(std::forward::type>(get(tuple))...); } +template inline void call_with_tuple + (Func &&func, Ttuple &&tuple) + { + call_with_tuple_impl(std::forward(func), tuple, + make_index_sequence()>()); + } +template +inline void call_with_tuple2_impl(Func &&func, const Ttuple& tuple, + index_sequence) + { func(get(tuple)...); } +template inline void call_with_tuple2 + (Func &&func, Ttuple &&tuple) + { + call_with_tuple2_impl(std::forward(func), tuple, + make_index_sequence()>()); + } + +template +inline auto tuple_transform_impl(tuple const& inputs, Func &&func, + index_sequence) + { return tuple...>{func(get(inputs))...}; } +template +inline auto tuple_transform(tuple const& inputs, Func &&func) + { + return tuple_transform_impl(inputs, std::forward(func), + make_index_sequence{}); + } +template +inline void tuple_for_each_impl(tuple &tpl, Func &&func, + index_sequence) + { (func(get(tpl)), ...); } +template +inline void tuple_for_each(tuple &tpl, Func &&func) + { + tuple_for_each_impl(tpl, std::forward(func), make_index_sequence{}); + } +template +inline void tuple_for_each_impl(const tuple &tpl, Func &&func, + index_sequence) + { (func(get(tpl)), ...); } +template +inline void tuple_for_each(const tuple &tpl, Func &&func) + { + tuple_for_each_impl(tpl, std::forward(func), make_index_sequence{}); + } + +template +inline auto tuple_transform_idx_impl(const tuple &inputs, + Func &&func, index_sequence) + { + return tuple...> + {func(get(inputs), Is)...}; + } + +template +inline auto tuple_transform_idx(const tuple &inputs, Func &&func) + { + return tuple_transform_idx_impl(inputs, std::forward(func), + make_index_sequence{}); + } +template +inline void tuple_for_each_idx_impl(tuple &tpl, Func 
&&func, + index_sequence) + { (func(get(tpl), Is), ...); } +template +inline void tuple_for_each_idx(tuple &tpl, Func &&func) + { + tuple_for_each_idx_impl(tpl, std::forward(func), make_index_sequence{}); + } + +template inline auto to_ref (const Ttuple &tuple) + { + return tuple_transform(tuple,[](auto &&ptr) -> typename std::add_lvalue_reference_t{ return *ptr; }); + } + +template inline Ttuple update_pointers (const Ttuple &ptrs, + const vector> &str, size_t idim, size_t i) + { + return tuple_transform_idx(ptrs, [i,idim,&str](auto &&ptr, size_t idx) + { return ptr + i*str[idx][idim]; }); + } + +template inline Ttuple update_pointers_contiguous (const Ttuple &ptrs, + size_t i) + { + return tuple_transform(ptrs, [i](auto &&ptr) { return ptr+i; }); + } +template inline void advance_contiguous (Ttuple &ptrs) + { tuple_for_each(ptrs, [](auto &&ptr) { ++ptr; }); } +template inline void advance (Ttuple &ptrs, + const vector> &str, size_t idim) + { + tuple_for_each_idx(ptrs, [idim,&str](auto &&ptr, size_t idx) + { ptr += str[idx][idim]; }); + } +template inline void advance_by_n (Ttuple &ptrs, + const vector> &str, size_t idim, size_t n) + { + tuple_for_each_idx(ptrs, [idim,n,&str](auto &&ptr, size_t idx) + { ptr += n*str[idx][idim]; }); + } + +template + DUCC0_NOINLINE void applyHelper_block(size_t idim, const vector &shp, + const vector> &str, size_t bsi, size_t bsj, + const Ttuple &ptrs, Func &&func) + { + auto leni=shp[idim], lenj=shp[idim+1]; + size_t nbi = (leni+bsi-1)/bsi; + size_t nbj = (lenj+bsj-1)/bsj; + for (size_t bi=0; bi + DUCC0_NOINLINE void applyHelper(size_t idim, const vector &shp, + const vector> &str, size_t block0, size_t block1, + const Ttuple &ptrs, Func &&func, bool last_contiguous) + { + auto len = shp[idim]; + if ((idim+2==shp.size()) && (block0!=0)) // we should do blocking + applyHelper_block(idim, shp, str, block0, block1, ptrs, func); + else if (idim+1 + inline void applyHelper(const vector &shp, + const vector> &str, size_t block0, size_t block1, + const Ttuple &ptrs, Func &&func, size_t nthreads, bool last_contiguous) + { + if (shp.size()==0) + call_with_tuple(std::forward(func), to_ref(ptrs)); + else if (nthreads==1) + applyHelper(0, shp, str, block0, block1, ptrs, std::forward(func), last_contiguous); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + applyHelper(0, locshp, str, block0, block1, locptrs, func, last_contiguous); + }); + } + +template + void mav_apply(Func &&func, int nthreads, Targs... 
args) + { + vector infos; + (infos.push_back(args), ...); + vector tsizes; + (tsizes.push_back(sizeof(args.data()[0])), ...); + auto [shp, str, block0, block1] = multiprep(infos, tsizes); + bool last_contiguous = true; + if (shp.size()>0) + for (const auto &s:str) + last_contiguous &= (s.back()==1); + + auto ptrs = tuple_transform(forward_as_tuple(args...), + [](auto &&arg){return arg.data();}); + applyHelper(shp, str, block0, block1, ptrs, std::forward(func), nthreads, last_contiguous); + } + +DUCC0_NOINLINE tuple> + multiprep_noopt(const vector &info); + +template +inline void call_with_tuple_arg_impl(Func &&func, Arg &&arg, const Ttuple& tuple, + index_sequence) + { func(std::forward::type>(get(tuple))..., arg); } +template inline void call_with_tuple_arg + (Func &&func, Arg &&arg, Ttuple &&tuple) + { + call_with_tuple_arg_impl(std::forward(func), arg, tuple, + make_index_sequence()>()); + } +template + DUCC0_NOINLINE void applyHelper_with_index(size_t idim, const vector &shp, + const vector> &str, const Ttuple &ptrs, Func &&func, + vector &index) + { + auto len = shp[idim]; + if (idim+1 &>(index), to_ref(locptrs)); + index[idim] = idxbak; + } + } +template + inline void applyHelper_with_index(const vector &shp, + const vector> &str, const Ttuple &ptrs, Func &&func, + size_t nthreads, vector &index) + { + if (shp.size()==0) + call_with_tuple_arg(std::forward(func), const_cast &>(index), to_ref(ptrs)); + else if (nthreads==1) + applyHelper_with_index(0, shp, str, ptrs, std::forward(func), index); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + auto locidx(index); + locidx[0]=lo; + applyHelper_with_index(0, locshp, str, locptrs, func, locidx); + }); + } +template + void mav_apply_with_index(Func &&func, int nthreads, Targs... args) + { + vector infos; + (infos.push_back(args), ...); + auto [shp, str] = multiprep_noopt(infos); + vector index(shp.size(), 0); + + auto ptrs = tuple_transform(forward_as_tuple(args...), + [](auto &&arg){return arg.data();}); + applyHelper_with_index(shp, str, ptrs, std::forward(func), nthreads, index); + } + + +template class mavref + { + private: + const mav_info &info; + T *d; + + public: + using shape_t = typename mav_info::shape_t; + using stride_t = typename mav_info::stride_t; + mavref(const mav_info &info_, T *d_) : info(info_), d(d_) {} + template T &operator()(Ns... ns) const + { return d[info.idx(ns...)]; } + /// Returns the total number of entries in the object. + size_t size() const { return info.size(); } + /// Returns the shape of the object. + const shape_t &shape() const { return info.shape(); } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return info.shape(i); } + /// Returns the strides of the object. + const stride_t &stride() const { return info.stride(); } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return info.stride(i); } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return info.last_contiguous(); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { return info.contiguous(); } + /// Returns true iff this->shape and \a other.shape match. 
+ bool conformable(const mavref &other) const + { return shape()==other.shape(); } + }; + +template + mavref make_mavref(const mav_info &info_, T *d_) + { return mavref(info_, d_); } + +template +inline auto tuple_transform2_impl(const tuple &i1, const tuple &i2, + Func &&func, index_sequence) + { return tuple...>{func(get(i1),get(i2))...}; } +template +inline auto tuple_transform2(const tuple &i1, const tuple &i2, + Func &&func) + { + return tuple_transform2_impl(i1, i2, std::forward(func), + make_index_sequence{}); + } +template + auto make_mavrefs(const Tptrs &ptrs, const Tinfos &infos) + { + return tuple_transform2(ptrs, infos, [](auto &&ptr, auto &&info) + { return make_mavref(info, ptr); }); + } + +template auto make_infos(const fmav_info &info) + { + if constexpr(ndim>0) + MR_assert(ndim<=info.ndim(), "bad dimensionality"); + auto iterdim = info.ndim()-ndim; + fmav_info fout({info.shape().begin(),info.shape().begin()+iterdim}, + {info.stride().begin(),info.stride().begin()+iterdim}); + + typename mav_info::shape_t shp; + typename mav_info::stride_t str; + if constexpr (ndim>0) // just to silence compiler warnings + for (size_t i=0; i iout(shp, str); + return make_tuple(fout, iout); + } + +template + DUCC0_NOINLINE void flexible_mav_applyHelper(size_t idim, const vector &shp, + const vector> &str, const Tptrs &ptrs, + const Tinfos &infos, Func &&func) + { + auto len = shp[idim]; + auto locptrs(ptrs); + if (idim+1 + DUCC0_NOINLINE void flexible_mav_applyHelper(const vector &shp, + const vector> &str, const Tptrs &ptrs, + const Tinfos &infos, Func &&func, size_t nthreads) + { + if (shp.size()==0) + call_with_tuple2(func, make_mavrefs(ptrs, infos)); + else if (nthreads==1) + flexible_mav_applyHelper(0, shp, str, ptrs, infos, std::forward(func)); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + flexible_mav_applyHelper(0, locshp, str, locptrs, infos, func); + }); + } + +template struct Xdim { static constexpr size_t dim=ndim; }; + +template + void xflexible_mav_apply(const Ttuple &tuple, const Tdim &dim, Func &&func, size_t nthreads) + { + auto fullinfos = tuple_transform2(tuple, dim, [](const auto &arg, const auto &dim) + { return make_infos::dim>(fmav_info(arg)); }); + vector iter_infos; + tuple_for_each(fullinfos,[&iter_infos](const auto &entry){iter_infos.push_back(get<0>(entry));}); + auto [shp, str] = multiprep(iter_infos); + + auto infos2 = tuple_transform(fullinfos, [](const auto &arg) + { return get<1>(arg); }); + auto ptrs = tuple_transform(tuple, [](auto &&arg){return arg.data();}); + flexible_mav_applyHelper(shp, str, ptrs, infos2, std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0) + { + xflexible_mav_apply(forward_as_tuple(m0), + forward_as_tuple(Xdim()), + std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0, T1 &&m1) + { + xflexible_mav_apply(forward_as_tuple(m0, m1), + forward_as_tuple(Xdim(), Xdim()), + std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0, T1 &&m1, T2 &&m2) + { + xflexible_mav_apply(forward_as_tuple(m0, m1, m2), + forward_as_tuple(Xdim(), Xdim(), Xdim()), + std::forward(func), nthreads); + } + +} + +using detail_mav::UNINITIALIZED; +using detail_mav::fmav_info; +using detail_mav::mav_info; +using detail_mav::slice; +using detail_mav::MAXIDX; +using detail_mav::cfmav; 
+using detail_mav::vfmav; +using detail_mav::cmav; +using detail_mav::vmav; +using detail_mav::subarray; +using detail_mav::mav_apply; +using detail_mav::mav_apply_with_index; +using detail_mav::flexible_mav_apply; +} + +#endif diff --git a/contrib/ducc0/infra/misc_utils.h b/contrib/ducc0/infra/misc_utils.h new file mode 100644 index 000000000..77d3019fa --- /dev/null +++ b/contrib/ducc0/infra/misc_utils.h @@ -0,0 +1,127 @@ +/* Copyright (C) 2019-2021 Max-Planck-Society + Author: Martin Reinecke */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_MISC_UTILS_H +#define DUCC0_MISC_UTILS_H + +#include +#include +#ifdef __GLIBC__ +#include +#include +#include +#endif + +namespace ducc0 { + +namespace detail_misc_utils { + +using namespace std; + +template auto calcShare(size_t nshares, size_t myshare, + const T &begin, const T &end) + { + auto nwork = end-begin; + auto nbase = nwork/nshares; + auto additional = nwork%nshares; + auto lo = begin + (myshare*nbase + ((myshare auto calcShare(size_t nshares, size_t myshare, const T &end) + { return calcShare(nshares, myshare, T(0), end); } + +template shp noncritical_shape(const shp &in, size_t elemsz) + { + constexpr size_t critstride = 4096; // must be a power of 2 + auto ndim = in.size(); + shp res(in); + size_t stride = elemsz; + for (size_t i=0, xi=ndim-1; i+1) +#include +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_simd { + +namespace stdx=std::experimental; +using stdx::native_simd; + +template struct simd_select + { using type = stdx::simd>; }; + +using stdx::element_aligned_tag; +template constexpr inline bool vectorizable = native_simd::size()>1; + +template constexpr bool simd_exists_h() + { + if constexpr (N>1) + if constexpr (vectorizable) + if constexpr (!std::is_same_v>, stdx::fixed_size_simd>) + return true; + return false; + } +template constexpr inline bool simd_exists = simd_exists_h(); + +template inline stdx::simd apply(stdx::simd in, Func func) + { + stdx::simd res; + for (size_t i=0; i inline stdx::simd sin(stdx::simd in) + { return apply(in,[](T v){return sin(v);}); } +template inline stdx::simd cos(stdx::simd in) + { return apply(in,[](T v){return cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using detail_simd::simd_exists; +using detail_simd::vectorizable; + +} + +#else + +// only enable SIMD support for gcc>=5.0 and clang>=5.0 +#ifndef DUCC0_NO_SIMD +#define DUCC0_NO_SIMD +#if defined(__clang__) +// AppleClang has their own version numbering +#ifdef __apple_build_version__ +# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) +# undef DUCC0_NO_SIMD +# endif +#elif __clang_major__ >= 5 +# undef DUCC0_NO_SIMD +#endif +#elif defined(__GNUC__) +#if __GNUC__>=5 +#undef DUCC0_NO_SIMD +#endif +#endif +#endif + +#include +#include +#include + +#ifndef DUCC0_NO_SIMD +#if defined(__SSE2__) // we are on an x86 platform and we have vector types +#include +#endif + +#if defined(__aarch64__) // let's check for SVE and Neon +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) +#if __ARM_FEATURE_SVE_BITS>0 +// OK, we can use SVE +#define DUCC0_USE_SVE +#include +#endif +#endif +#ifndef DUCC0_USE_SVE +// see if we can use Neon +#if defined(__ARM_NEON) +#define DUCC0_USE_NEON +#include +#endif +#endif +#endif + +#endif + +namespace ducc0 { + +namespace detail_simd { + +/// true iff SIMD support is provided for \a T. +template constexpr inline bool vectorizable = false; +#if (!defined(DUCC0_NO_SIMD)) +#if defined(__SSE2__) || defined (DUCC0_USE_SVE) || defined (DUCC0_USE_NEON) +template<> constexpr inline bool vectorizable = true; +template<> constexpr inline bool vectorizable = true; +#endif +#endif + +/// true iff a SIMD type with vector length \a len exists for \a T. 
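+/// (The primary template below defaults to false; architecture-specific
+/// specializations further down are expected to flip it to true for the
+/// vector lengths the target ISA actually provides.)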
+template constexpr inline bool simd_exists = false; + +template constexpr size_t vectorlen + = vectorizable ? reglen/sizeof(T) : 1; + +template class helper_; +template struct vmask_ + { + private: + using hlp = helper_; + using Tm = typename hlp::Tm; + Tm v; + + public: +#if defined(_MSC_VER) + vmask_() {} + vmask_(const vmask_ &other) : v(other.v) {} + vmask_ &operator=(const vmask_ &other) + { v = other.v; return *this; } +#else + vmask_() = default; + vmask_(const vmask_ &other) = default; + vmask_ &operator=(const vmask_ &other) = default; +#endif + vmask_(Tm v_): v(v_) {} + operator Tm() const { return v; } + bool none() const { return hlp::mask_none(v); } + bool any() const { return hlp::mask_any(v); } + bool all() const { return hlp::mask_all(v); } + vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); } + vmask_ &operator&= (const vmask_ &other) { v=hlp::mask_and(v,other.v); return *this; } + vmask_ operator| (const vmask_ &other) const { return hlp::mask_or(v,other.v); } + vmask_ &operator|= (const vmask_ &other) { v=hlp::mask_or(v,other.v); return *this; } + }; +struct element_aligned_tag {}; +template class vtp + { + private: + using hlp = helper_; + + public: + using value_type = T; + using Tv = typename hlp::Tv; + using Tm = vmask_; + static constexpr size_t size() { return len; } + + private: + Tv v; + + public: +#if defined(_MSC_VER) + vtp() {} + vtp(const vtp &other): v(other.v) {} + vtp &operator=(const vtp &other) + { v=other.v; return *this; } +#else + vtp() = default; + vtp(const vtp &other) = default; + vtp &operator=(const vtp &other) = default; +#endif + vtp(T other): vtp(hlp::from_scalar(other)) {} + vtp(const Tv &other) : v(other) {} + vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; } + operator Tv() const { return v; } + + vtp(const T *ptr, element_aligned_tag) : v(hlp::loadu(ptr)) {} + void copy_to(T *ptr, element_aligned_tag) const { hlp::storeu(ptr, v); } + + vtp operator-() const { return vtp(-v); } + vtp operator+(vtp other) const { return vtp(v+other.v); } + vtp operator-(vtp other) const { return vtp(v-other.v); } + vtp operator*(vtp other) const { return vtp(v*other.v); } + vtp operator/(vtp other) const { return vtp(v/other.v); } + vtp &operator+=(vtp other) { v+=other.v; return *this; } + vtp &operator-=(vtp other) { v-=other.v; return *this; } + vtp &operator*=(vtp other) { v*=other.v; return *this; } + vtp &operator/=(vtp other) { v/=other.v; return *this; } + vtp abs() const { return hlp::abs(v); } + inline vtp sqrt() const + { return hlp::sqrt(v); } + vtp max(const vtp &other) const + { return hlp::max(v, other.v); } + vtp min(const vtp &other) const + { return hlp::min(v, other.v); } + Tm operator>(const vtp &other) const + { return hlp::gt(v, other.v); } + Tm operator>=(const vtp &other) const + { return hlp::ge(v, other.v); } + Tm operator<(const vtp &other) const + { return hlp::lt(v, other.v); } + Tm operator<=(const vtp &other) const + { return hlp::le(v, other.v); } + Tm operator==(const vtp &other) const + { return hlp::eq(v, other.v); } + Tm operator!=(const vtp &other) const + { return hlp::ne(v, other.v); } + static vtp blend(Tm mask, const vtp &a, const vtp &b) + { return hlp::blend(mask, a, b); } + + class reference + { + private: + vtp &v; + size_t i; + public: + reference (vtp &v_, size_t i_) + : v(v_), i(i_) {} + reference &operator= (T other) + { v.v[i] = other; return *this; } + reference &operator*= (T other) + { v.v[i] *= other; return *this; } + operator T() const { return 
v.v[i]; } + }; + + void Set(size_t i, T val) { v[i] = val; } + reference operator[](size_t i) { return reference(*this, i); } + T operator[](size_t i) const { return v[i]; } + + class where_expr + { + private: + vtp &v; + Tm m; + + public: + where_expr (Tm m_, vtp &v_) + : v(v_), m(m_) {} + where_expr &operator= (const vtp &other) + { v=hlp::blend(m, other.v, v.v); return *this; } + where_expr &operator*= (const vtp &other) + { v=hlp::blend(m, v.v*other.v, v.v); return *this; } + where_expr &operator+= (const vtp &other) + { v=hlp::blend(m, v.v+other.v, v.v); return *this; } + where_expr &operator-= (const vtp &other) + { v=hlp::blend(m, v.v-other.v, v.v); return *this; } + }; + }; +template inline vtp abs(vtp v) { return v.abs(); } +template typename vtp::where_expr where(typename vtp::Tm m, vtp &v) + { return typename vtp::where_expr(m, v); } +template vtp operator*(T0 a, vtp b) + { return b*a; } +template vtp operator+(T a, vtp b) + { return b+a; } +template vtp operator-(T a, vtp b) + { return vtp(a) - b; } +template vtp max(vtp a, vtp b) + { return a.max(b); } +template vtp min(vtp a, vtp b) + { return a.min(b); } +template vtp sqrt(vtp v) + { return v.sqrt(); } +template inline bool none_of(const vmask_ &mask) + { return mask.none(); } +template inline bool any_of(const vmask_ &mask) + { return mask.any(); } +template inline bool all_of(const vmask_ &mask) + { return mask.all(); } +template inline vtp blend (const vmask_ &mask, const vtp &a, const vtp &b) + { return vtp::blend(mask, a, b); } +template T reduce(const vtp &v, Op op) + { + T res=v[0]; + for (size_t i=1; i vtp apply(vtp in, Func func) + { + vtp res; + for (size_t i=0; i class pseudoscalar + { + private: + T v; + + public: +#if defined(_MSC_VER) + pseudoscalar() {} + pseudoscalar(const pseudoscalar &other) : v(other.v) {} + pseudoscalar & operator=(const pseudoscalar &other) + { v=other.v; return *this; } +#else + pseudoscalar() = default; + pseudoscalar(const pseudoscalar &other) = default; + pseudoscalar & operator=(const pseudoscalar &other) = default; +#endif + pseudoscalar(T v_):v(v_) {} + pseudoscalar operator-() const { return pseudoscalar(-v); } + pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); } + pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); } + pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); } + pseudoscalar operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); } + pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; } + pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; } + pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; } + pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; } + + pseudoscalar abs() const { return std::abs(v); } + inline pseudoscalar sqrt() const { return std::sqrt(v); } + pseudoscalar max(const pseudoscalar &other) const + { return std::max(v, other.v); } + pseudoscalar min(const pseudoscalar &other) const + { return std::min(v, other.v); } + + bool operator>(const pseudoscalar &other) const + { return v>other.v; } + bool operator>=(const pseudoscalar &other) const + { return v>=other.v; } + bool operator<(const pseudoscalar &other) const + { return v class helper_ + { + private: + static constexpr size_t len = 1; + public: + using Tv = pseudoscalar; + using Tm = bool; + + static Tv loadu(const T *ptr) { return *ptr; } + static void storeu(T *ptr, Tv v) { *ptr = v[0]; } + + static Tv 
from_scalar(T v) { return v; } + static Tv abs(Tv v) { return v.abs(); } + static Tv max(Tv v1, Tv v2) { return v1.max(v2); } + static Tv min(Tv v1, Tv v2) { return v1.min(v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; } + static Tv sqrt(Tv v) { return v.sqrt(); } + static Tm gt (Tv v1, Tv v2) { return v1>v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 8; + public: + using Tv = __m512d; + using Tm = __mmask8; + + static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_pd(v); } + static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm512_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 16; + public: + using Tv = __m512; + using Tm = __mmask16; + + static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_ps(v); } + static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm512_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + 
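+    // Editor's note: AVX specialization; one __m256d packs len = 4 doubles,
+    // and AVX comparisons return all-ones/all-zero lanes, so the mask type
+    // reuses the vector type (Tm == Tv == __m256d below).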
static constexpr size_t len = 4; + public: + using Tv = __m256d; + using Tm = __m256d; + + static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_pd(v); } + static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm256_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 8; + public: + using Tv = __m256; + using Tm = __m256; + + static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_ps(v); } + static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm256_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_ps(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = __m128d; + using Tm = __m128d; + + static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_pd(v); } + static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); } + static Tv min(Tv v1, Tv 
v2) { return _mm_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_pd(v2,v1,m); +#else + return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); } + static Tm le (Tv v1, Tv v2) { return _mm_cmple_pd(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_pd(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_pd(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = __m128; + using Tm = __m128; + + static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_ps(v); } + static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_ps(v2,v1,m); +#else + return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); } + static Tm le (Tv v1, Tv v2) { return _mm_cmple_ps(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_ps(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< class gnuvec_helper + { + public: + using Tv __attribute__ ((vector_size (len*sizeof(T)))) = T; + using Tm = decltype(Tv()v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +template<> constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +#endif + +#if defined(DUCC0_USE_NEON) +template<> constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = float64x2_t; + using Tm = uint64x2_t; + + static Tv loadu(const T *ptr) { return vld1q_f64(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f64(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f64(v); } + static Tv abs(Tv v) { return vabsq_f64(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f64(v1, v2); } + static Tv 
min(Tv v1, Tv v2) { return vminq_f64(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { return vbslq_f64(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f64(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f64(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f64(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return vcltq_f64(v1,v2); } + static Tm le (Tv v1, Tv v2) { return vcleq_f64(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return vceqq_f64(v1,v2); } + static Tm ne (Tv v1, Tv v2) + { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(v1,v2)))); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u64(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u64(v1,v2); } + static size_t maskbits(Tm v) + { + auto high_bits = vshrq_n_u64(v, 63); + return vgetq_lane_u64(high_bits, 0) | ((vgetq_lane_u64(high_bits, 1)<<1)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = float32x4_t; + using Tm = uint32x4_t; + + static Tv loadu(const T *ptr) { return vld1q_f32(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f32(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f32(v); } + static Tv abs(Tv v) { return vabsq_f32(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f32(v1, v2); } + static Tv min(Tv v1, Tv v2) { return vminq_f32(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return vbslq_f32(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f32(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f32(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f32(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return vcltq_f32(v1,v2); } + static Tm le (Tv v1, Tv v2) { return vcleq_f32(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return vceqq_f32(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return vmvnq_u32(vceqq_f32(v1,v2)); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u32(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u32(v1,v2); } + static size_t maskbits(Tm v) + { + static constexpr int32x4_t shift = {0, 1, 2, 3}; + auto tmp = vshrq_n_u32(v, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< using native_simd = vtp>; +#elif defined(__AVX__) +template using native_simd = vtp>; +#elif defined(__SSE2__) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_SVE) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_NEON) +template using native_simd = vtp>; +#else +template using native_simd = vtp; +#endif + +#else // DUCC0_NO_SIMD is defined +/// The SIMD type for \a T with the largest vector length on this platform. +template using native_simd = vtp; +#endif +/// Provides a SIMD type for \a T with vector length \a len, if it exists. 
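+// Editor's note on the native_simd selection above: with the register widths
+// implied there, native_simd<double> has 8 lanes under AVX-512, 4 under AVX,
+// 2 under SSE2/NEON, and degenerates to the scalar pseudoscalar wrapper
+// otherwise. A hedged usage sketch, using only names defined in this header:
+//
+//   ducc0::native_simd<float> v(1.5f);   // broadcast a scalar to all lanes
+//   v = v*v + 0.25f;                     // lane-wise arithmetic
+//   float lane0 = v[0];                  // per-lane read access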
+template struct simd_select + { using type = vtp; }; +template inline vtp sin(vtp in) + { return apply(in,[](T v){return std::sin(v);}); } +template inline vtp cos(vtp in) + { return apply(in,[](T v){return std::cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using detail_simd::simd_exists; +using detail_simd::vectorizable; + +} +#endif +#endif diff --git a/contrib/ducc0/infra/string_utils.cc b/contrib/ducc0/infra/string_utils.cc new file mode 100644 index 000000000..652d91b3b --- /dev/null +++ b/contrib/ducc0/infra/string_utils.cc @@ -0,0 +1,223 @@ +/* + * This file is part of libcxxsupport. + * + * libcxxsupport is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libcxxsupport is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libcxxsupport; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libcxxsupport is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + * This file contains the implementation of various convenience functions + * used by the Planck LevelS package. + * + * Copyright (C) 2002-2021 Max-Planck-Society + * Author: Martin Reinecke + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/string_utils.h" +#include "ducc0/infra/error_handling.h" + +namespace ducc0 { + +namespace detail_string_utils { + +using namespace std; + +string trim (const string &orig) + { + string::size_type p1=orig.find_first_not_of(" \t"); + if (p1==string::npos) return ""; + string::size_type p2=orig.find_last_not_of(" \t"); + return orig.substr(p1,p2-p1+1); + } + +template string dataToString (const T &x) + { + ostringstream strstrm; + strstrm << x; + return trim(strstrm.str()); + } + +template<> string dataToString (const bool &x) + { return x ? 
"T" : "F"; } +template<> string dataToString (const string &x) + { return trim(x); } +template<> string dataToString (const float &x) + { + ostringstream strstrm; + strstrm << setprecision(8) << x; + return trim(strstrm.str()); + } +template<> string dataToString (const double &x) + { + ostringstream strstrm; + strstrm << setprecision(16) << x; + return trim(strstrm.str()); + } +template<> string dataToString (const long double &x) + { + ostringstream strstrm; + strstrm << setprecision(25) << x; + return trim(strstrm.str()); + } + +template string dataToString (const signed char &x); +template string dataToString (const unsigned char &x); +template string dataToString (const short &x); +template string dataToString (const unsigned short &x); +template string dataToString (const int &x); +template string dataToString (const unsigned int &x); +template string dataToString (const long &x); +template string dataToString (const unsigned long &x); +template string dataToString (const long long &x); +template string dataToString (const unsigned long long &x); + +string intToString(int64_t x, size_t width) + { + ostringstream strstrm; + (x>=0) ? strstrm << setw(width) << setfill('0') << x + : strstrm << "-" << setw(width-1) << setfill('0') << -x; + string res = strstrm.str(); + MR_assert(res.size()==width,"number too large"); + return trim(res); + } + +template T stringToData (const string &x) + { + istringstream strstrm(x); + T value; + strstrm >> value; + bool ok = bool(strstrm); + if (ok) + { + string rest; + strstrm >> rest; + ok = rest.length()==0; + } + MR_assert(ok, "could not convert '", x, "' to desired data type."); + return value; + } + +template<> string stringToData (const string &x) + { return trim(x); } + +template<> bool stringToData (const string &x) + { + const char *fval[] = {"f","n","false",".false."}; + const char *tval[] = {"t","y","true",".true."}; + for (size_t i=0; i< sizeof(fval)/sizeof(fval[0]); ++i) + if (equal_nocase(x,fval[i])) return false; + for (size_t i=0; i< sizeof(tval)/sizeof(tval[0]); ++i) + if (equal_nocase(x,tval[i])) return true; + MR_fail("conversion error in stringToData(",x,")"); + } + +template signed char stringToData (const string &x); +template unsigned char stringToData (const string &x); +template short stringToData (const string &x); +template unsigned short stringToData (const string &x); +template int stringToData (const string &x); +template unsigned int stringToData (const string &x); +template long stringToData (const string &x); +template unsigned long stringToData (const string &x); +template long long stringToData (const string &x); +template unsigned long long stringToData (const string &x); +template float stringToData (const string &x); +template double stringToData (const string &x); +template long double stringToData (const string &x); + +bool equal_nocase (const string &a, const string &b) + { + if (a.size()!=b.size()) return false; + for (size_t m=0; m vector split (istream &stream) + { + vector list; + while (stream) + { + string word; + stream >> word; + MR_assert (stream||stream.eof(), + "error while splitting stream into components"); + if (stream) list.push_back(stringToData(word)); + } + return list; + } + +} // unnamed namespace + +template vector split (const string &inp) + { + istringstream is(inp); + return split(is); + } + +template vector split (const string &inp); +template vector split (const string &inp); +template vector split (const string &inp); +template vector split (const string &inp); +template vector split 
(const string &inp); + +vector tokenize (const string &inp, char delim) + { + istringstream stream(inp); + string token; + vector list; + while (getline(stream,token,delim)) + list.push_back(token); + return list; + } + +vector parse_words_from_file (const string &filename) + { + vector words; + ifstream inp(filename.c_str()); + MR_assert (inp,"Could not open file '", filename, "'."); + while (inp) + { + string word; + inp>>word; + word=trim(word); + if (word!="") words.push_back(word); + } + return words; + } + +}} diff --git a/contrib/ducc0/infra/string_utils.h b/contrib/ducc0/infra/string_utils.h new file mode 100644 index 000000000..e95ef4672 --- /dev/null +++ b/contrib/ducc0/infra/string_utils.h @@ -0,0 +1,99 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/string_utils.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef DUCC0_STRING_UTILS_H +#define DUCC0_STRING_UTILS_H + +// FIXME: most of this will be superseded by C++20 std::format + +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_string_utils { + +/*! \defgroup stringutilsgroup String handling helper functions */ +/*! \{ */ + +/// Returns the string \a orig without leading and trailing whitespace. +std::string trim (const std::string &orig); + +/// Returns a string containing the text representation of \a x. +/*! Care is taken that no information is lost in the conversion. */ +template std::string dataToString(const T &x); +template<> std::string dataToString (const bool &x); +template<> std::string dataToString (const std::string &x); +template<> std::string dataToString (const float &x); +template<> std::string dataToString (const double &x); +template<> std::string dataToString (const long double &x); + +/// Returns a string containing the text representation of \a x, padded +/// with leading zeroes to \a width characters. +std::string intToString(std::int64_t x, std::size_t width); + +/// Reads a value of a given datatype from a string. +template T stringToData (const std::string &x); +template<> std::string stringToData (const std::string &x); +template<> bool stringToData (const std::string &x); + +/// Case-insensitive string comparison +/*! Returns \a true, if \a a and \a b differ only in capitalisation, + else \a false. */ +bool equal_nocase (const std::string &a, const std::string &b); + +/// Returns lowercase version of \a input. +std::string tolower(const std::string &input); + +/// Tries to split \a inp into a white-space separated list of values of +/// type \a T, and appends them to \a list. +template inline std::vector split (const std::string &inp); + +/// Breaks the string \a inp into tokens separated by \a delim, and returns them +/// as a vector. 
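+// Editor's note: the container types below lost their template arguments in
+// extraction; judging from the .cc file they are presumably
+// std::vector<std::string>. Example behaviour of the getline-based tokenize:
+//   tokenize("a:b:c", ':') -> {"a", "b", "c"}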
+std::vector tokenize (const std::string &inp, char delim); + +/// Breaks the contents of file \a filename into tokens separated by white +/// space, and returns them as a vector. +std::vector parse_words_from_file (const std::string &filename); + +/*! \} */ + +} + +using detail_string_utils::trim; +//using detail_string_utils::intToString; +using detail_string_utils::dataToString; +using detail_string_utils::stringToData; +using detail_string_utils::equal_nocase; +//using detail_string_utils::tolower; +//using detail_string_utils::split; +//using detail_string_utils::tokenize; +//using detail_string_utils::parse_words_from_file; + +} + +#endif diff --git a/contrib/ducc0/infra/threading.cc b/contrib/ducc0/infra/threading.cc new file mode 100644 index 000000000..77c04b50d --- /dev/null +++ b/contrib/ducc0/infra/threading.cc @@ -0,0 +1,759 @@ +/** \file ducc0/infra/threading.cc + * + * \copyright Copyright (C) 2019-2023 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "ducc0/infra/threading.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/string_utils.h" +#include +#include +#include + +#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __has_include() +#include +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) +#include +#endif +#endif +#endif + +namespace ducc0 { + +namespace detail_threading { + +class latch + { + std::atomic num_left_; + Mutex mut_; + CondVar completed_; + using lock_t = UniqueLock; + + public: + latch(size_t n): num_left_(n) {} + + void count_down() + { + lock_t lock(mut_); + if (--num_left_) + return; + completed_.notify_all(); + } + + void wait() + { + lock_t lock(mut_); + completed_.wait(lock, [this]{ return is_ready(); }); + } + bool is_ready() { return num_left_ == 0; } + }; + +#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING + +size_t ducc0_max_threads() + { + static const size_t max_threads_ = []() + { +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + pthread_getaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + size_t res=0; + for (size_t i=0; i(1, std::thread::hardware_concurrency()); +#endif + auto evar=getenv("DUCC0_NUM_THREADS"); + // fallback + if (!evar) + evar=getenv("OMP_NUM_THREADS"); + if (!evar) + return res; + auto res2 = stringToData(trim(std::string(evar))); + MR_assert(res2>=0, "invalid value in DUCC0_NUM_THREADS/OMP_NUM_THREADS"); + if (res2==0) + return res; + return std::min(res, res2); + }(); + return max_threads_; + } + +static thread_local bool in_parallel_region = false; +int pin_info() + { + static const int pin_info_ = []() + { + auto evar=getenv("DUCC0_PIN_DISTANCE"); + if (!evar) + return -1; // do nothing at all + auto res = stringToData(trim(std::string(evar))); + return int(res); + }(); + return pin_info_; + } +int pin_offset() + { + static const int pin_offset_ = []() + { + auto evar=getenv("DUCC0_PIN_OFFSET"); + if (!evar) + return 0; + auto res = stringToData(trim(std::string(evar))); + return int(res); + }(); + return pin_offset_; + } + +template class concurrent_queue + { + std::queue q_; + Mutex mut_; + std::atomic size_=0; + using lock_t = LockGuard; + + public: + void push(T val) + { + lock_t lock(mut_); + ++size_; + q_.push(std::move(val)); + } + + bool try_pop(T &val) + { + if (size_==0) return false; + lock_t lock(mut_); + // Queue might have been emptied while we acquired the lock + if (q_.empty()) return false; + + val = std::move(q_.front()); + --size_; + q_.pop(); + return true; + } + + bool empty() const { return size_==0; } + }; + +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) +static void do_pinning(int ithread) + { + if (pin_info()==-1) return; + int num_proc = sysconf(_SC_NPROCESSORS_ONLN); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + int cpu_wanted = pin_offset() + ithread*pin_info(); + MR_assert((cpu_wanted>=0)&&(cpu_wanted work; + + void worker_main( + std::atomic &shutdown_flag, + std::atomic &unscheduled_tasks, + concurrent_queue> &overflow_work, size_t ithread) + { + in_parallel_region = true; + do_pinning(ithread); + using lock_t = UniqueLock; + bool expect_work = true; + while (!shutdown_flag || expect_work) + { + 
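+        // Editor's summary of the worker loop: a thread first waits for an
+        // item handed directly to its own slot ('work', guarded by 'mut'),
+        // runs it, then drains the shared overflow queue; if 'busy_flag' was
+        // already set by a submitter, a direct item is incoming, so the
+        // thread loops back to wait on its slot ('expect_work') instead of
+        // draining the queue.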
std::function local_work; + if (expect_work || unscheduled_tasks == 0) + { + lock_t lock(mut); + // Wait until there is work to be executed + work_ready.wait(lock, [&]{ return (work || shutdown_flag); }); + local_work.swap(work); + expect_work = false; + } + + bool marked_busy = false; + if (local_work) + { + marked_busy = true; + local_work(); + } + + if (!overflow_work.empty()) + { + if (!marked_busy && busy_flag.test_and_set()) + { + expect_work = true; + continue; + } + marked_busy = true; + + while (overflow_work.try_pop(local_work)) + { + --unscheduled_tasks; + local_work(); + } + } + + if (marked_busy) busy_flag.clear(); + } + } + }; + + concurrent_queue> overflow_work_; + Mutex mut_; + std::vector workers_; + std::atomic shutdown_=false; + std::atomic unscheduled_tasks_=0; + using lock_t = LockGuard; + + void create_threads() + { + lock_t lock(mut_); + size_t nthreads=workers_.size(); + for (size_t i=0; ibusy_flag.clear(); + worker->work = nullptr; + worker->thread = std::thread( + [worker, this, i]{ worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_, i); }); + } + catch (...) + { + shutdown_locked(); + throw; + } + } + } + + void shutdown_locked() + { + shutdown_ = true; + for (auto &worker : workers_) + worker.work_ready.notify_all(); + + for (auto &worker : workers_) + if (worker.thread.joinable()) + worker.thread.join(); + } + + public: + explicit ducc_thread_pool(size_t nthreads): + workers_(nthreads) + { create_threads(); } + + //virtual + ~ducc_thread_pool() { shutdown(); } + + //virtual + size_t nthreads() const { return workers_.size(); } + + //virtual + size_t adjust_nthreads(size_t nthreads_in) const + { + if (in_parallel_region) + return 1; + if (nthreads_in==0) + return ducc0_max_threads(); + return std::min(ducc0_max_threads(), nthreads_in); + } + //virtual + void submit(std::function work) + { + lock_t lock(mut_); + if (shutdown_) + throw std::runtime_error("Work item submitted after shutdown"); + + ++unscheduled_tasks_; + + // First check for any idle workers and wake those + for (auto &worker : workers_) + if (!worker.busy_flag.test_and_set()) + { + --unscheduled_tasks_; + { + lock_t lock(worker.mut); + worker.work = std::move(work); + worker.work_ready.notify_one(); + } + return; + } + + // If no workers were idle, push onto the overflow queue for later + overflow_work_.push(std::move(work)); + } + + void shutdown() + { + lock_t lock(mut_); + shutdown_locked(); + } + + void restart() + { + shutdown_ = false; + create_threads(); + } + }; + +// return a pointer to a singleton thread_pool, which is always available +inline ducc_thread_pool *get_master_pool() + { + static auto master_pool = new ducc_thread_pool(ducc0_max_threads()-1); +#if __has_include() + static std::once_flag f; + call_once(f, + []{ + pthread_atfork( + +[]{ get_master_pool()->shutdown(); }, // prepare + +[]{ get_master_pool()->restart(); }, // parent + +[]{ get_master_pool()->restart(); } // child + ); + }); +#endif + return master_pool; + } + +thread_local thread_pool *active_pool = get_master_pool(); + +thread_pool *set_active_pool(thread_pool *new_pool) + { return std::exchange(active_pool, new_pool); } +thread_pool *get_active_pool() + { + if (!active_pool) active_pool = get_master_pool(); + MR_assert(active_pool, "no thread pool active"); + return active_pool; + } + +#endif + +#ifdef DUCC0_NO_LOWLEVEL_THREADING + +class ducc_pseudo_thread_pool: public thread_pool + { + public: + ducc_pseudo_thread_pool() {} + + //virtual + size_t nthreads() const { return 1; } + + //virtual + 
size_t adjust_nthreads(size_t /*nthreads_in*/) const + { return 1; } + //virtual + void submit(std::function work) + { work(); } + }; + +// return a pointer to a singleton thread_pool, which is always available +inline ducc_pseudo_thread_pool *get_master_pool() + { + static auto master_pool = new ducc_pseudo_thread_pool(); + return master_pool; + } + +thread_local thread_pool *active_pool = get_master_pool(); + +thread_pool *set_active_pool(thread_pool *new_pool) + { return std::exchange(active_pool, new_pool); } +thread_pool *get_active_pool() + { + MR_assert(active_pool!=nullptr, "no thread pool active"); + return active_pool; + } + +#endif + +size_t max_threads() + { return get_active_pool()->nthreads()+1; } +size_t adjust_nthreads(size_t nthreads_in) + { return get_active_pool()->adjust_nthreads(nthreads_in); } + +class Distribution + { + private: + size_t nthreads_; + Mutex mut_; + size_t nwork_; + size_t cur_; + std::atomic cur_dynamic_; + size_t chunksize_; + double fact_max_; + struct alignas(64) spaced_size_t { size_t v; }; + std::vector nextstart; + enum SchedMode { SINGLE, STATIC, DYNAMIC, GUIDED }; + SchedMode mode; + bool single_done; + + void thread_map(std::function f); + + public: + size_t nthreads() const { return nthreads_; } + + void execSingle(size_t nwork, std::function f) + { + mode = SINGLE; + single_done = false; + nwork_ = nwork; + nthreads_ = 1; + thread_map(std::move(f)); + } + void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function f) + { + mode = STATIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? (nwork_+nthreads_-1)/nthreads_ + : chunksize; + if (chunksize_>=nwork_) + return execSingle(nwork_, std::move(f)); +// if there are fewer chunks than threads, reduce nthreads + nthreads_ = std::min(nthreads_, (nwork_+chunksize_-1)/chunksize_); + nextstart.resize(nthreads_); + for (size_t i=0; i f) + { + mode = DYNAMIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? 1 : chunksize; + if (chunksize_ >= nwork) + return execSingle(nwork, std::move(f)); + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, chunksize_, std::move(f)); + cur_dynamic_ = 0; + thread_map(std::move(f)); + } + void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function f) + { + mode = GUIDED; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize_min<1) ? 
1 : chunksize_min; + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, chunksize_, std::move(f)); + fact_max_ = fact_max; + cur_ = 0; + thread_map(std::move(f)); + } + void execParallel(size_t nthreads, std::function f) + { + mode = STATIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nthreads_; + chunksize_ = 1; + thread_map(std::move(f)); + } + Range getNext(size_t thread_id) + { + switch (mode) + { + case SINGLE: + { + if (single_done) return Range(); + single_done=true; + return Range(0, nwork_); + } + case STATIC: + { + if (nextstart[thread_id].v>=nwork_) return Range(); + size_t lo=nextstart[thread_id].v; + size_t hi=std::min(lo+chunksize_,nwork_); + nextstart[thread_id].v += nthreads_*chunksize_; + return Range(lo, hi); + } + case DYNAMIC: + { + auto curval = cur_dynamic_.fetch_add(chunksize_); + return Range(std::min(curval, nwork_), + std::min(curval+chunksize_, nwork_)); + } + case GUIDED: + { + LockGuard lck(mut_); + if (cur_>=nwork_) return Range(); + auto rem = nwork_-cur_; + size_t tmp = size_t((fact_max_*double(rem))/double(nthreads_)); + auto sz = std::min(rem, std::max(chunksize_, tmp)); + size_t lo=cur_; + cur_+=sz; + size_t hi=cur_; + return Range(lo, hi); + } + } + return Range(); + } + }; + +class MyScheduler: public Scheduler + { + private: + Distribution &dist_; + size_t ithread_; + + public: + MyScheduler(Distribution &dist, size_t ithread) + : dist_(dist), ithread_(ithread) {} + virtual size_t num_threads() const { return dist_.nthreads(); } + virtual size_t thread_num() const { return ithread_; } + virtual Range getNext() { return dist_.getNext(ithread_); } + }; + +template class ScopedValueChanger + { + private: + T &object; + T original_value; + + public: + ScopedValueChanger(T &object_, T new_value) + : object(object_), original_value(object_) { object=new_value; } + ~ScopedValueChanger() + { object=original_value; } + }; + +#define DUCC0_HIERARCHICAL_SUBMISSION +#ifdef DUCC0_HIERARCHICAL_SUBMISSION + +// The next two definitions are taken from TensorFlow sources. +// Copyright 2015 The TensorFlow Authors. + +// Basic y-combinator implementation. +template struct YCombinatorImpl { + Func func; + template + decltype(auto) operator()(Args&&... args) const { + return func(*this, std::forward(args)...); + } +}; + +template YCombinatorImpl> YCombinator(Func&& func) { + return YCombinatorImpl>{std::forward(func)}; +} + +#endif + +void Distribution::thread_map(std::function f) + { + if (nthreads_ == 1) + { + MyScheduler sched(*this, 0); + f(sched); + return; + } + + std::exception_ptr ex; + Mutex ex_mut; + // we "copy" the currently active thread pool to all executing threads + // during the execution of f. This ensures that possible nested parallel + // regions are handled by the same pool and not by the one that happens + // to be active on the worker threads. + // Alternatively we could put a "no-threading" thread pool onto the executing + // threads, which executes everything sequentially on its own thread, + // automatically prohibiting nested parallelism. 
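+  // Editor's sketch of what this pool propagation enables (hedged; uses only
+  // declarations from threading.h): nested parallel regions inside 'f' are
+  // served by the pool that launched the outer region, e.g.
+  //
+  //   ducc0::execParallel(4, [](size_t tid) {
+  //     // an execStatic(...) issued here reuses the same pool, via the
+  //     // ScopedUseThreadPool guard installed below.
+  //     });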
+ auto pool = get_active_pool(); + +#ifdef DUCC0_HIERARCHICAL_SUBMISSION + + latch counter(nthreads_); + // distribute work to helper threads, in a recursive fashion + auto new_f = YCombinator([this, &f, &counter, &ex, &ex_mut, pool](auto &new_f, size_t istart, size_t step) -> void { + try + { + ScopedValueChanger changer(in_parallel_region, true); + ScopedUseThreadPool guard(*pool); + for(; step>0; step>>=1) + if(istart+stepsubmit([&new_f, istart, step]() + {new_f(istart+step, step>>1);}); + MyScheduler sched(*this, istart); + f(sched); + } + catch (...) + { + LockGuard lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + + size_t biggest_step=1; + while (biggest_step*2submit( + [this, &f, i, &counter, &ex, &ex_mut, pool] { + try + { + ScopedUseThreadPool guard(*pool); + MyScheduler sched(*this, i); + f(sched); + } + catch (...) + { + LockGuard lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + } + { + // do remaining work directly on this thread + ScopedValueChanger changer(in_parallel_region, true); + MyScheduler sched(*this, 0); + f(sched); + } + +#endif +#undef DUCC0_HIERARCHICAL_SUBMISSION + + counter.wait(); + if (ex) + std::rethrow_exception(ex); + } + +void execSingle(size_t nwork, std::function func) + { + Distribution dist; + dist.execSingle(nwork, std::move(func)); + } +void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execStatic(nwork, nthreads, chunksize, std::move(func)); + } +void execDynamic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execDynamic(nwork, nthreads, chunksize, std::move(func)); + } +void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function func) + { + Distribution dist; + dist.execGuided(nwork, nthreads, chunksize_min, fact_max, std::move(func)); + } +void execParallel(size_t nthreads, std::function func) + { + Distribution dist; + dist.execParallel(nthreads, std::move(func)); + } +void execParallel(size_t nthreads, std::function func) + { + Distribution dist; + dist.execParallel(nthreads, [&](Scheduler &sched) + { func(sched.thread_num()); }); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = adjust_nthreads(nthreads); + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(lo, hi); + }); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = adjust_nthreads(nthreads); + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(tid, lo, hi); + }); + } + +}} diff --git a/contrib/ducc0/infra/threading.h b/contrib/ducc0/infra/threading.h new file mode 100644 index 000000000..21fb8300f --- /dev/null +++ b/contrib/ducc0/infra/threading.h @@ -0,0 +1,329 @@ +/** \file ducc0/infra/threading.h + * Mulithreading support, similar to functionality provided by OpenMP + * + * \copyright Copyright (C) 2019-2023 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_THREADING_H
+#define DUCC0_THREADING_H
+
+// Low-level threading support can be influenced by the following macros:
+// - DUCC0_NO_LOWLEVEL_THREADING: if defined, multithreading is disabled
+//   and all parallel regions will be executed sequentially
+//   on the invoking thread.
+// - DUCC0_CUSTOM_LOWLEVEL_THREADING: if defined, external definitions of
+//   Mutex, UniqueLock, LockGuard, CondVar, set_active_pool(),
+//   and get_active_pool() must be supplied in "ducc0_custom_lowlevel_threading.h"
+//   and the code will use those.
+// The two macros must not both be defined at the same time.
+// If neither macro is defined, standard ducc0 multithreading will be active.
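+// Editor's sketch: a minimal "ducc0_custom_lowlevel_threading.h" would have
+// to place, inside namespace ducc0::detail_threading, at least the following
+// (the my_* names are hypothetical placeholders):
+//
+//   using Mutex = my_mutex;                  // lockable
+//   using UniqueLock = my_unique_lock;       // lock/unlock-able lock object
+//   using LockGuard = my_lock_guard;         // scoped lock
+//   using CondVar = my_condition_variable;   // wait(lock, pred)/notify
+//
+// plus out-of-line definitions of set_active_pool()/get_active_pool()
+// returning the externally managed thread_pool.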
+
+#if (defined(DUCC0_NO_LOWLEVEL_THREADING) && defined(DUCC0_CUSTOM_LOWLEVEL_THREADING))
+static_assert(false, "DUCC0_NO_LOWLEVEL_THREADING and DUCC0_CUSTOM_LOWLEVEL_THREADING must not both be defined");
+#endif
+
+#if defined(DUCC0_STDCXX_LOWLEVEL_THREADING)
+static_assert(false, "DUCC0_STDCXX_LOWLEVEL_THREADING must not be defined externally");
+#endif
+
+#if ((!defined(DUCC0_NO_LOWLEVEL_THREADING)) && (!defined(DUCC0_CUSTOM_LOWLEVEL_THREADING)))
+#define DUCC0_STDCXX_LOWLEVEL_THREADING
+#endif
+
+#include <cstddef>
+#include <functional>
+#include <optional>
+#include <vector>
+
+// threading-specific headers
+#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING
+#include <mutex>
+#include <condition_variable>
+#endif
+
+#ifdef DUCC0_NO_LOWLEVEL_THREADING
+// no headers needed
+#endif
+
+namespace ducc0 {
+namespace detail_threading {
+
+using std::size_t;
+
+/// Abstract base class for minimalistic thread pool functionality
+class thread_pool
+  {
+  public:
+    virtual ~thread_pool() {}
+    /// Returns the total number of threads managed by the pool
+    virtual size_t nthreads() const = 0;
+    /** "Normalizes" a requested number of threads. A useful convention could be
+        return (nthreads_in==0) ? nthreads() : min(nthreads(), nthreads_in); */
+    virtual size_t adjust_nthreads(size_t nthreads_in) const = 0;
+    virtual void submit(std::function<void()> work) = 0;
+  };
+
+}}
+
+#ifdef DUCC0_CUSTOM_LOWLEVEL_THREADING
+#include "ducc0_custom_lowlevel_threading.h"
+#endif
+
+namespace ducc0 {
+
+namespace detail_threading {
+
+thread_pool *set_active_pool(thread_pool *new_pool);
+thread_pool *get_active_pool();
+
+// define threading-related types depending on the underlying implementation
+#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING
+using Mutex = std::mutex;
+using UniqueLock = std::unique_lock<std::mutex>;
+using LockGuard = std::lock_guard<std::mutex>;
+using CondVar = std::condition_variable;
+#endif
+
+#ifdef DUCC0_NO_LOWLEVEL_THREADING
+struct Mutex
+  {
+  void lock(){}
+  void unlock(){}
+  };
+struct LockGuard
+  {
+  LockGuard(const Mutex &){}
+  };
+struct UniqueLock
+  {
+  UniqueLock(const Mutex &){}
+  void lock() {}
+  void unlock() {}
+  };
+struct CondVar
+  {
+  template<typename Predicate>
+  void wait(UniqueLock &, Predicate) {}
+  void notify_one() noexcept {}
+  void notify_all() noexcept {}
+  };
+#endif
+
+using std::size_t;
+
+class ScopedUseThreadPool
+  {
+  private:
+    thread_pool *old_pool_;
+  public:
+    ScopedUseThreadPool(thread_pool &pool)
+      { old_pool_ = set_active_pool(&pool); }
+    ~ScopedUseThreadPool()
+      { set_active_pool(old_pool_); }
+  };
+
+/// Index range describing a chunk of work inside a parallelized loop
+struct Range
+  {
+  size_t lo, //< first index of the chunk
+         hi; //< one-past-last index of the chunk
+  Range() : lo(0), hi(0) {}
+  Range(size_t lo_, size_t hi_) : lo(lo_), hi(hi_) {}
+  /// Returns true iff the chunk is not empty
+  operator bool() const { return hi>lo; }
+  };
+
+/// Class supplied to parallel regions, which allows them to determine their
+/// work chunks.
+class Scheduler
+  {
+  public:
+    virtual ~Scheduler() {}
+    /// Returns the number of threads working in this parallel region
+    virtual size_t num_threads() const = 0;
+    /// Returns the number of this thread, from the range 0 to num_threads()-1.
+    virtual size_t thread_num() const = 0;
+    /// Returns information about the next chunk of work.
+    /// If this chunk is empty, the work on this thread is done.
+    virtual Range getNext() = 0;
+  };
+
+/** Returns the maximum number of threads that are supported by the currently
+    active thread pool.
+ */
+size_t max_threads();
+size_t adjust_nthreads(size_t nthreads);
+
+/// Execute \a func over \a nwork work items, on a single thread.
+void execSingle(size_t nwork,
+  std::function<void(Scheduler &)> func);
+/// Execute \a func over \a nwork work items, on \a nthreads threads.
+/** Chunks will have the size \a chunksize, except for the last one which
+ * may be smaller.
+ *
+ * Chunks are statically assigned to threads at startup. */
+void execStatic(size_t nwork, size_t nthreads, size_t chunksize,
+  std::function<void(Scheduler &)> func);
+/// Execute \a func over \a nwork work items, on \a nthreads threads.
+/** Chunks will have the size \a chunksize, except for the last one which
+ * may be smaller.
+ *
+ * Chunks are assigned dynamically to threads; whenever a thread is finished
+ * with its current chunk, it will obtain the next one from the list of
+ * remaining chunks. */
+void execDynamic(size_t nwork, size_t nthreads, size_t chunksize,
+  std::function<void(Scheduler &)> func);
+void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min,
+  double fact_max, std::function<void(Scheduler &)> func);
+/// Execute \a func on \a nthreads threads.
+/** Work subdivision must be organized within \a func. */
+void execParallel(size_t nthreads, std::function<void(Scheduler &)> func);
+/// Execute \a func on \a nthreads threads, passing only the thread number.
+/** Work subdivision must be organized within \a func. */
+void execParallel(size_t nthreads, std::function<void(size_t)> func);
+/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads.
+/** Work items are subdivided fairly among threads. */
+void execParallel(size_t work_lo, size_t work_hi, size_t nthreads,
+  std::function<void(size_t, size_t)> func);
+/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads.
+/** Work items are subdivided fairly among threads. */
+inline void execParallel(size_t nwork, size_t nthreads,
+  std::function<void(size_t, size_t)> func)
+  { execParallel(0, nwork, nthreads, func); }
+/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads.
+/** The first argument to \a func is the thread number.
+ *
+ * Work items are subdivided fairly among threads. */
+void execParallel(size_t work_lo, size_t work_hi, size_t nthreads,
+  std::function<void(size_t, size_t, size_t)> func);
+/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads.
+/** The first argument to \a func is the thread number.
+ *
+ * Work items are subdivided fairly among threads. */
+inline void execParallel(size_t nwork, size_t nthreads,
+  std::function<void(size_t, size_t, size_t)> func)
+  { execParallel(0, nwork, nthreads, func); }
+
+template<typename T> class Worklist
+  {
+  private:
+    Mutex mtx;
+    CondVar cv;
+    size_t nworking{0};
+    std::vector<T> items;
+
+  public:
+    Worklist(const std::vector<T> &items_)
+      : items(items_) {}
+
+    std::optional<T> get_item()
+      {
+      UniqueLock lck(mtx);
+      if ((--nworking==0) && items.empty()) cv.notify_all();
+      cv.wait(lck,[&](){return (!items.empty()) || (nworking==0);});
+      if (!items.empty())
+        {
+        auto res = items.back();
+        items.pop_back();
+        ++nworking;
+        return res;
+        }
+      else
+        return {};
+      }
+    void startup()
+      {
+      LockGuard lck(mtx);
+      ++nworking;
+      }
+    void put_item(const T &item)
+      {
+      LockGuard lck(mtx);
+      items.push_back(item);
+      cv.notify_one();
+      }
+  };
+
+/// Execute \a func on work items in \a items over \a nthreads threads.
+/** While processing a work item, \a func may submit further items to the list
+ * of work items. For this purpose, \a func must take a const T &
+ * (the work item to be processed) as well as a function which also takes
+ * a const T & (the insert function). Work items will be assigned whenever a
+ * thread becomes available.
*/ +template auto execWorklist + (size_t nthreads, const std::vector &items, Func &&func) + { + Worklist wl(items); + execParallel(nthreads, [&wl, &func](auto &) { + wl.startup(); + while(auto wrk=wl.get_item()) + func(wrk.value(), [&wl](const T &item){wl.put_item(item);}); + }); + } + +} // end of namespace detail_threading + +using detail_threading::Mutex; +using detail_threading::LockGuard; +using detail_threading::UniqueLock; +using detail_threading::CondVar; +using detail_threading::thread_pool; +using detail_threading::ScopedUseThreadPool; +using detail_threading::max_threads; +using detail_threading::adjust_nthreads; +using detail_threading::Scheduler; +using detail_threading::execSingle; +using detail_threading::execStatic; +using detail_threading::execDynamic; +using detail_threading::execGuided; +using detail_threading::execParallel; +using detail_threading::execWorklist; + +} // end of namespace ducc0 + +#endif diff --git a/contrib/ducc0/infra/useful_macros.h b/contrib/ducc0/infra/useful_macros.h new file mode 100644 index 000000000..eaef26779 --- /dev/null +++ b/contrib/ducc0/infra/useful_macros.h @@ -0,0 +1,74 @@ +/* +This file is part of the ducc library. + +Copyright (C) 2010-2022 Max-Planck-Society + +Author: Martin Reinecke +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
diff --git a/contrib/ducc0/infra/useful_macros.h b/contrib/ducc0/infra/useful_macros.h
new file mode 100644
index 000000000..eaef26779
--- /dev/null
+++ b/contrib/ducc0/infra/useful_macros.h
@@ -0,0 +1,74 @@
+/*
+This file is part of the ducc library.
+
+Copyright (C) 2010-2022 Max-Planck-Society
+
+Author: Martin Reinecke
+*/
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_USEFUL_MACROS_H
+#define DUCC0_USEFUL_MACROS_H
+
+#if defined(__GNUC__)
+#define DUCC0_NOINLINE [[gnu::noinline]]
+#define DUCC0_RESTRICT __restrict__
+#define DUCC0_PREFETCH_R(addr) __builtin_prefetch(addr);
+#define DUCC0_PREFETCH_W(addr) __builtin_prefetch(addr,1);
+#elif defined(_MSC_VER)
+#define DUCC0_NOINLINE __declspec(noinline)
+#define DUCC0_RESTRICT __restrict
+#define DUCC0_PREFETCH_R(addr)
+#define DUCC0_PREFETCH_W(addr)
+#else
+#define DUCC0_NOINLINE
+#define DUCC0_RESTRICT
+#define DUCC0_PREFETCH_R(addr)
+#define DUCC0_PREFETCH_W(addr)
+#endif
+
+#endif
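Illustrative only (not from the patch): a typical application of the portability macros above; axpy is a made-up helper.

    #include <cstddef>
    #include "ducc0/infra/useful_macros.h"

    // never inlined; x and y are promised not to alias
    DUCC0_NOINLINE void axpy(size_t n, double a,
                             const double* DUCC0_RESTRICT x,
                             double* DUCC0_RESTRICT y) {
      for (size_t i = 0; i < n; ++i) {
        DUCC0_PREFETCH_R(&x[i])  // read hint; supplies its own ';' on GCC/Clang,
                                 // expands to nothing elsewhere
        y[i] += a * x[i];
      }
    }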
diff --git a/contrib/ducc0/math/cmplx.h b/contrib/ducc0/math/cmplx.h
new file mode 100644
index 000000000..522a3bdda
--- /dev/null
+++ b/contrib/ducc0/math/cmplx.h
@@ -0,0 +1,108 @@
+/** \file ducc0/math/cmplx.h
+ * Minimalistic complex number class
+ *
+ * \copyright Copyright (C) 2019-2023 Max-Planck-Society
+ * \author Martin Reinecke
+ */
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_CMPLX_H
+#define DUCC0_CMPLX_H
+
+namespace ducc0 {
+
+/// Very basic class representing complex numbers
+/** Meant exclusively for internal low-level use, e.g. in FFT routines. */
+template<typename T> struct Cmplx {
+  T r, i;
+  Cmplx() {}
+  constexpr Cmplx(T r_, T i_) : r(r_), i(i_) {}
+  constexpr Cmplx(T r_) : r(r_), i(T(0)) {}
+  void Set(T r_, T i_) { r=r_; i=i_; }
+  void Set(T r_) { r=r_; i=T(0); }
+  void Split(T &r_, T &i_) const { r_=r; i_=i; }
+  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
+  Cmplx &operator+= (const Cmplx &other)
+    { r+=other.r; i+=other.i; return *this; }
+  template<typename T2> Cmplx &operator*= (T2 other)
+    { r*=other; i*=other; return *this; }
+  template<typename T2> Cmplx &operator*= (const Cmplx<T2> &other)
+    {
+    T tmp = r*other.r - i*other.i;
+    i = r*other.i + i*other.r;
+    r = tmp;
+    return *this;
+    }
+  Cmplx conj() const { return {r, -i}; }
+  template<typename T2> Cmplx &operator+= (const Cmplx<T2> &other)
+    { r+=other.r; i+=other.i; return *this; }
+  template<typename T2> Cmplx &operator-= (const Cmplx<T2> &other)
+    { r-=other.r; i-=other.i; return *this; }
+  template<typename T2> auto operator* (const T2 &other) const
+    -> Cmplx<decltype(r*other)>
+    { return {r*other, i*other}; }
+  template<typename T2> auto operator+ (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    { return {r+other.r, i+other.i}; }
+  template<typename T2> auto operator- (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r-other.r)>
+    { return {r-other.r, i-other.i}; }
+  template<typename T2> auto operator* (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
+  template<bool fwd, typename T2> auto special_mul (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    {
+    using Tres = Cmplx<decltype(r+other.r)>;
+    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
+               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
+    }
+  };
+
+}
+
+#endif
diff --git a/contrib/ducc0/math/unity_roots.h b/contrib/ducc0/math/unity_roots.h
new file mode 100644
index 000000000..09df542b2
--- /dev/null
+++ b/contrib/ducc0/math/unity_roots.h
@@ -0,0 +1,241 @@
+/* Copyright (C) 2019-2021 Max-Planck-Society
+   Author: Martin Reinecke */
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_UNITY_ROOTS_H +#define DUCC0_UNITY_ROOTS_H + +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_unity_roots { + +using namespace std; + +template class UnityRoots + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + static cmplx_ calc(size_t x, size_t n, Thigh ang) + { + x<<=3; + if (x<4*n) // first half + { + if (x<2*n) // first quadrant + { + if (x>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + idx = N-idx; + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +template class MultiExp + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + public: + MultiExp(T ang0, size_t n) + : N(n) + { + Thigh ang = ang0; + size_t nval = n+2; + shift = 1; + while((size_t(1)<>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +} + +using detail_unity_roots::UnityRoots; +using detail_unity_roots::MultiExp; + +} + +#endif diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp index eb7189da0..bc9b36b29 100644 --- a/examples/guru1d1.cpp +++ b/examples/guru1d1.cpp @@ -20,9 +20,9 @@ int main(int argc, char* argv[]) Barnett 2/27/20 Compile on linux with (or see ../makefile): - g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm + g++-7 -std=c++17 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 - Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1 */ diff --git a/examples/guru1d1c.c b/examples/guru1d1c.c index 7ad036f4b..9be63a0d5 100644 --- a/examples/guru1d1c.c +++ b/examples/guru1d1c.c @@ -12,9 +12,9 @@ int main(int argc, char* argv[]) C complex type, with a math check. Barnett 6/22/20. 
Compile on linux with: - gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ + gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lm -lstdc++ - Or if you have built a single-core library, remove -fopenmp and -lfftw3_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1c. See also: guru1d1 */ diff --git a/examples/guru1d1f.cpp b/examples/guru1d1f.cpp index a46c4a735..72b706d77 100644 --- a/examples/guru1d1f.cpp +++ b/examples/guru1d1f.cpp @@ -19,9 +19,9 @@ int main(int argc, char* argv[]) Barnett 7/5/20 Compile on linux with: - g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f -lfftw3f -lfftw3f_omp -lm + g++-7 -std=c++17 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f - Or if you have built a single-core library, remove -fopenmp and -lfftw3f_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1f */ diff --git a/examples/guru2d1.cpp b/examples/guru2d1.cpp index 06d25e064..13a29928b 100644 --- a/examples/guru2d1.cpp +++ b/examples/guru2d1.cpp @@ -13,9 +13,9 @@ int main(int argc, char *argv[]){ except illustrates the guru interface. Compile multithreaded with - g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 single core with: - g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm + g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 Usage: ./guru2d1 */ diff --git a/examples/many1d1.cpp b/examples/many1d1.cpp index 8176007c9..353db1504 100644 --- a/examples/many1d1.cpp +++ b/examples/many1d1.cpp @@ -12,9 +12,9 @@ int main(int argc, char* argv[]) double complex vectors, with a math test. Compile with: - g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 or if you have built a single-core version: - g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm + g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 Usage: ./many1d1 */ diff --git a/examples/simple1d1.cpp b/examples/simple1d1.cpp index cb1b9e493..7dac780ea 100644 --- a/examples/simple1d1.cpp +++ b/examples/simple1d1.cpp @@ -15,9 +15,9 @@ int main(int argc, char* argv[]) Double-precision version (see simple1d1f for single-precision) Compile with: - g++ -fopenmp simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 or if you have built a single-core version: - g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lm + g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 Usage: ./simple1d1 */ diff --git a/examples/simple1d1c.c b/examples/simple1d1c.c index b3c718659..6a9ac7e69 100644 --- a/examples/simple1d1c.c +++ b/examples/simple1d1c.c @@ -13,9 +13,9 @@ int main(int argc, char* argv[]) with a math test. Double-precision. C99 style. opts is struct not ptr to it. 
Compile with: - gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ + gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lm -lstdc++ or if you have built a single-core version: - gcc example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lm -lstdc++ + gcc example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lm -lstdc++ Usage: ./example1d1c */ diff --git a/examples/simple1d1cf.c b/examples/simple1d1cf.c index db79c06e1..1bde4af95 100644 --- a/examples/simple1d1cf.c +++ b/examples/simple1d1cf.c @@ -13,9 +13,9 @@ int main(int argc, char* argv[]) with a math test. Single-precision version. C99 style. opts is a struct. Compile with: - gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lfftw3f_omp -lm -lstdc++ + gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lm -lstdc++ or if you have built a single-core version: - gcc example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lm -lstdc++ + gcc example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lm -lstdc++ Usage: ./example1d1cf */ diff --git a/examples/simple1d1f.cpp b/examples/simple1d1f.cpp index fea98b8d6..68f1c0259 100644 --- a/examples/simple1d1f.cpp +++ b/examples/simple1d1f.cpp @@ -15,9 +15,9 @@ int main(int argc, char* argv[]) (See simple1d1 for double-precision version.) Compile with: - g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm + g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f or if you have built a single-core version: - g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm + g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f Usage: ./simple1d1f */ diff --git a/examples/simple2d1.cpp b/examples/simple2d1.cpp index cf912445b..b383aca40 100644 --- a/examples/simple2d1.cpp +++ b/examples/simple2d1.cpp @@ -14,9 +14,9 @@ int main(int argc, char *argv[]){ arrays of C++ complex numbers, with a math test. Double precision version. Compile multithreaded with - g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 single core with: - g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm + g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 Usage: ./simple2d1 */ diff --git a/examples/simulplans1d1.cpp b/examples/simulplans1d1.cpp index b814876a2..25e824f03 100644 --- a/examples/simulplans1d1.cpp +++ b/examples/simulplans1d1.cpp @@ -34,13 +34,12 @@ double chk1d1(int n, vector& x, vector>& c, int main(int argc, char* argv[]) /* Demo two simultaneous FINUFFT plans (A,B) being handled in C++ without - interacting (or at least without crashing; note that FFTW initialization - is the only global state of FINUFFT library). + interacting. Using STL double complex vectors, with a math test. 
Edited from guru1d1, Barnett 2/15/22 Compile & run: - g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 + g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 && ./simulplans1d1 */ { double tol = 1e-9; // desired accuracy for both plans diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp index f25f25b8b..0dd4212c4 100644 --- a/examples/threadsafe1d1.cpp +++ b/examples/threadsafe1d1.cpp @@ -15,9 +15,6 @@ int main(int argc, char* argv[]) Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test. Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp. - Notes: You may not have libfftw3_omp, so I have switched to - libfftw3_threads in this suggested compile command: - g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1 Usage: ./threadsafe1d1 diff --git a/fortran/examples/guru1d1.f b/fortran/examples/guru1d1.f index 3f9c66827..3877a5c2c 100755 --- a/fortran/examples/guru1d1.f +++ b/fortran/examples/guru1d1.f @@ -6,7 +6,7 @@ c To compile (linux/GCC) from this directory, use eg (paste to one line): c gfortran -fopenmp -I../../include -I/usr/include guru1d1.f -c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ -o guru1d1 +c ../../lib/libfinufft.so -lgomp -lstdc++ -o guru1d1 c Alex Barnett and Libin Lu 5/29/20. ptr fixes 10/6/21 @@ -15,8 +15,6 @@ program guru1d1 c our fortran header, always needed include 'finufft.fh' -c if you want to use FFTW's modes by name... - include 'fftw3.f' c note some inputs are int (int*4) but others BIGINT (int*8) integer ier,iflag @@ -41,7 +39,7 @@ program guru1d1 c how many nonuniform pts M = 1000000 -c how many modes (not too much since FFTW_MEASURE slow later) +c how many modes N = 100000 allocate(fk(N)) @@ -104,8 +102,6 @@ program guru1d1 print *,'' print *, 'setting new options, rerun guru interface...' call finufft_default_opts(opts) -c refer to fftw3.f to set various FFTW plan modes... - opts%fftw = FFTW_ESTIMATE_PATIENT opts%debug = 1 c note you need a fresh plan if change opts call finufft_makeplan(ttype,dim,n_modes,iflag,ntrans, diff --git a/fortran/examples/guru1d1f.f b/fortran/examples/guru1d1f.f index 90d43174c..74508feaf 100755 --- a/fortran/examples/guru1d1f.f +++ b/fortran/examples/guru1d1f.f @@ -15,8 +15,6 @@ program guru1d1f c our fortran-header, always needed include 'finufft.fh' -c if you want to use FFTW's modes by name... - include 'fftw3.f' c note some inputs are int (int*4) but others BIGINT (int*8) integer ier,iflag @@ -40,7 +38,7 @@ program guru1d1f c how many nonuniform pts M = 200000 -c how many modes (not too much since FFTW_MEASURE slow later) +c how many modes N = 100000 allocate(fk(N)) @@ -103,8 +101,6 @@ program guru1d1f print *,'' print *, 'setting new options, rerun guru interface...' call finufftf_default_opts(opts) -c refer to fftw3.f to set various FFTW plan modes... 
- opts%fftw = FFTW_ESTIMATE_PATIENT opts%debug = 1 c note you need a fresh plan if change opts call finufftf_makeplan(ttype,dim,n_modes,iflag,ntrans, diff --git a/fortran/examples/nufft1d_demo.f b/fortran/examples/nufft1d_demo.f index e63a7e434..9b16b266b 100755 --- a/fortran/examples/nufft1d_demo.f +++ b/fortran/examples/nufft1d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib; paste to a single line): c c gfortran nufft1d_demo.f ../directft/dirft1d.f -o nufft1d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft1d_demo implicit none diff --git a/fortran/examples/nufft1d_demof.f b/fortran/examples/nufft1d_demof.f index 13a40d601..24d7c4797 100755 --- a/fortran/examples/nufft1d_demof.f +++ b/fortran/examples/nufft1d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib; paste to a single line): c c gfortran nufft1d_demof.f ../directft/dirft1df.f -o nufft1d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft1d_demof implicit none diff --git a/fortran/examples/nufft2d_demo.f b/fortran/examples/nufft2d_demo.f index b37cfeffc..281112e4a 100755 --- a/fortran/examples/nufft2d_demo.f +++ b/fortran/examples/nufft2d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib, paste to a single line): c c gfortran nufft2d_demo.f ../directft/dirft2d.f -o nufft2d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft2d_demo implicit none diff --git a/fortran/examples/nufft2d_demof.f b/fortran/examples/nufft2d_demof.f index b649f6109..7192274de 100755 --- a/fortran/examples/nufft2d_demof.f +++ b/fortran/examples/nufft2d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib, paste to a single line): c c gfortran nufft2d_demof.f ../directft/dirft2df.f -o nufft2d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft2d_demof implicit none diff --git a/fortran/examples/nufft2dmany_demo.f b/fortran/examples/nufft2dmany_demo.f index 605237161..7a4928f41 100755 --- a/fortran/examples/nufft2dmany_demo.f +++ b/fortran/examples/nufft2dmany_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded; paste to a single line): c c gfortran nufft2dmany_demo.f ../directft/dirft2d.f -o nufft2dmany_demo -c -L../../lib -lfinufft -lfftw3 -lfftw3_omp -lstdc++ +c -L../../lib -lfinufft -lstdc++ c program nufft2dmany_demo implicit none diff --git a/fortran/examples/nufft3d_demo.f b/fortran/examples/nufft3d_demo.f index af04afa5f..17c4074e6 100755 --- a/fortran/examples/nufft3d_demo.f +++ b/fortran/examples/nufft3d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static, paste to a single line): c c gfortran nufft3d_demo.f ../directft/dirft3d.f -o nufft3d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft3d_demo implicit none diff --git a/fortran/examples/nufft3d_demof.f b/fortran/examples/nufft3d_demof.f index 2e5a9e21f..6cc856574 100755 --- a/fortran/examples/nufft3d_demof.f +++ b/fortran/examples/nufft3d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static, paste to a single line): c c gfortran 
nufft3d_demof.f ../directft/dirft3df.f -o nufft3d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft3d_demof implicit none diff --git a/fortran/examples/simple1d1.f b/fortran/examples/simple1d1.f index f187806ec..ee9a81f19 100755 --- a/fortran/examples/simple1d1.f +++ b/fortran/examples/simple1d1.f @@ -6,7 +6,7 @@ c To compile (linux/GCC) from this directory, use eg (paste to one line): c gfortran -fopenmp -I../../include simple1d1.f -o simple1d1 -c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ +c ../../lib/libfinufft.so -lgomp -lstdc++ c Alex Barnett and Libin Lu 5/28/20, fix ptrs 10/6/21 diff --git a/fortran/examples/simple1d1.f90 b/fortran/examples/simple1d1.f90 index e1c6dcc2f..2368ad92f 100755 --- a/fortran/examples/simple1d1.f90 +++ b/fortran/examples/simple1d1.f90 @@ -6,7 +6,7 @@ ! To compile (linux/GCC) from this directory, note the module also has to be ! compiled, eg: -! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ +! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lgomp -lstdc++ ! Alex Barnett, to demo Reinhard Neder f90 module, 1/20/23. diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 77bc69b6b..f06710989 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -166,10 +166,6 @@ // -------- FINUFFT's plan object, prec-switching version ------------------ // NB: now private (the public C++ or C etc user sees an opaque pointer to it) -// FFTW is needed since we include a FFTW plan in the FINUFFT plan... -#include // (must come after complex.h) -// (other FFT lib headers eg MKL could be here...) - // group together a bunch of type 3 rescaling/centering/phasing parameters: #define TYPE3PARAMS FINUFFTIFY(_type3Params) typedef struct { @@ -186,7 +182,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) BIGINT nk; // number of NU freq pts (type 3 only) FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc + int batchSize; // # strength vectors to group together for FFT, etc int nbatch; // how many batches done to cover all ntrans vectors BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 @@ -205,8 +201,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ FLT* phiHat2; // " y-axis. FLT* phiHat3; // " z-axis. - FFTW_CPX* fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array + CPX* fwBatch; // FIXME: UNUSED, kept for layout compatibility BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp bool didSort; // whether binsorting used (false: identity perm used) @@ -218,13 +213,12 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) CPX* prephase; // pre-phase, for all input NU pts CPX* deconv; // reciprocal of kernel FT, phase, all output NU pts - CPX* CpBatch; // working array of prephased strengths + CPX* CpBatch; // FIXME: UNUSED, kept for layout compatibility FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated TYPE3PARAMS t3P; // groups together type 3 shift, scale, phase, parameters FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 // other internal structs; each is C-compatible of course - FFTW_PLAN fftwPlan; finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h deleted file mode 100644 index 89d86f0de..000000000 --- a/include/finufft/fftw_defs.h +++ /dev/null @@ -1,48 +0,0 @@ -// all FFTW-related private FINUFFT headers - -#ifndef FFTW_DEFS_H -#define FFTW_DEFS_H - -// Here we define typedefs and MACROS to switch between single and double -// precision library compilation, which need different FFTW command symbols. -// Barnett simplified via FFTWIFY, 6/7/22. - -#include // (after complex.h) needed so can typedef FFTW_CPX - -// precision-switching names for interfaces to FFTW... -#ifdef SINGLE - // macro to prepend fftw_ (for double) or fftwf_ (for single) to a string - // without a space. The 2nd level of indirection is needed for safety, see: - // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting - #define FFTWIFY_UNSAFE(x) fftwf_##x -#else - #define FFTWIFY_UNSAFE(x) fftw_##x -#endif -#define FFTWIFY(x) FFTWIFY_UNSAFE(x) -// now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros): -#define FFTW_CPX FFTWIFY(complex) -#define FFTW_PLAN FFTWIFY(plan) -#define FFTW_ALLOC_RE FFTWIFY(alloc_real) -#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) -#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) -#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) -#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) -#define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft) -#define FFTW_EX FFTWIFY(execute) -#define FFTW_DE FFTWIFY(destroy_plan) -#define FFTW_FR FFTWIFY(free) -#define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom) -#define FFTW_CLEANUP FFTWIFY(cleanup) -// the following OMP switch could be done in the src code instead... -#ifdef _OPENMP - #define FFTW_INIT FFTWIFY(init_threads) - #define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) - #define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) -#else - // no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... 
- #define FFTW_INIT() - #define FFTW_PLAN_TH(x) - #define FFTW_CLEANUP_THREADS() -#endif - -#endif // FFTW_DEFS_H diff --git a/include/finufft/finufft_eitherprec.h b/include/finufft/finufft_eitherprec.h index f46272011..abbf7edfb 100644 --- a/include/finufft/finufft_eitherprec.h +++ b/include/finufft/finufft_eitherprec.h @@ -118,5 +118,3 @@ typedef struct FINUFFT_PLAN_S * FINUFFT_PLAN; #undef FINUFFT_PLAN #undef FINUFFT_PLAN_S #undef FINUFFT_TYPE3PARAMS -#undef FINUFFT_FFTW_CPX -#undef FINUFFT_FFTW_PLAN diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 54b058266..cf53bb699 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -20,9 +20,6 @@ #include // prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc... #include -// since "many" (vector) tests need direct access to FFTW commands... -// (although this now happens to be included in defs.h too) -#include // std stuff for tester src #include diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 3a0156000..3d4da53af 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -20,7 +20,7 @@ typedef struct finufft_opts{ // defaults see finufft.cpp:finufft_default_opts() // algorithm performance opts... int nthreads; // number of threads to use, or 0 uses all available - int fftw; // plan flags to FFTW (FFTW_ESTIMATE=64, FFTW_MEASURE=0,...) + int fftw; // FIXME: UNUSED, kept for layout compatibility int spread_sort; // spreader: 0 don't sort, 1 do, or 2 heuristic choice int spread_kerevalmeth; // spreader: 0 exp(sqrt()), 1 Horner piecewise poly (faster) int spread_kerpad; // (exp(sqrt()) only): 0 don't pad kernel to 4n, 1 do diff --git a/make.inc.macosx_arm64 b/make.inc.macosx_arm64 index 8889fb964..50c42aee9 100644 --- a/make.inc.macosx_arm64 +++ b/make.inc.macosx_arm64 @@ -31,14 +31,11 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib -L/opt/homebrew/opt/libomp/lib # OpenMP with clang needs following... OMPFLAGS = -Xpreprocessor -fopenmp OMPLIBS = -L/usr/local/lib -L/opt/homebrew/lib -lomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: this will probably segfault. Instead we suggest you use # make.inc.macosx_clang_matlab -# Some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # may need to edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_arm64_matlab2022b_beta b/make.inc.macosx_arm64_matlab2022b_beta index 762e55c3b..b8f685f77 100644 --- a/make.inc.macosx_arm64_matlab2022b_beta +++ b/make.inc.macosx_arm64_matlab2022b_beta @@ -37,12 +37,9 @@ OMPLIBS = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/toolbox/eml/extern # we need to use -Wl,-rpath to add iomp lib directory to the runtime library search path # add iomp runtime search path to linker flags LDFLAGS += -Wl,-rpath,$(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/toolbox/eml/externalDependency/omp/maca64/lib/ -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location, but this is where -# brew should put things... +# this is where brew should put things... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # should work, or edit for your MATLAB version location... 
MEX = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/bin/mex @@ -52,5 +49,3 @@ MEX = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/bin/mex # extras by Wallace Chen: LDFLAGS += -L/opt/homebrew/opt/libomp/lib CXXFLAGS += -I/opt/homebrew/opt/libomp/include -LDFLAGS += -L/opt/homebrew/opt/fftw/lib -CXXFLAGS += -I/opt/homebrew/opt/fftw/include diff --git a/make.inc.macosx_clang b/make.inc.macosx_clang index 9ca4734d5..35149a26d 100644 --- a/make.inc.macosx_clang +++ b/make.inc.macosx_clang @@ -31,14 +31,11 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with clang needs following... OMPFLAGS = -Xpreprocessor -fopenmp OMPLIBS = -L/usr/local/lib -L/usr/local/opt/libomp/lib -lomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: this will probably segfault. Instead we suggest you use # make.inc.macosx_clang_matlab -# Some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -L/usr/local/lib -lm # may need to edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_clang_matlab b/make.inc.macosx_clang_matlab index c856d537b..e7853d6b1 100644 --- a/make.inc.macosx_clang_matlab +++ b/make.inc.macosx_clang_matlab @@ -34,12 +34,9 @@ OMPLIBS = $(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/libiomp5.d # we need to use -Wl,-rpath to add iomp lib directory to the runtime library search path # add iomp runtime search path to linker flags LDFLAGS += -Wl,-rpath,$(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/ -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location, but this is where -# brew should put things... +# this is where brew should put things... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # should work, or edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_gcc-10 b/make.inc.macosx_gcc-10 index 6329483ca..10edcc478 100644 --- a/make.inc.macosx_gcc-10 +++ b/make.inc.macosx_gcc-10 @@ -32,11 +32,8 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with GCC on OSX needs following... OMPFLAGS = -fopenmp OMPLIBS = -L/usr/local/lib -lgomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_gcc-8 b/make.inc.macosx_gcc-8 index 464c524c8..ac4a8b3ba 100644 --- a/make.inc.macosx_gcc-8 +++ b/make.inc.macosx_gcc-8 @@ -33,11 +33,8 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with GCC on OSX needs following... OMPFLAGS = -fopenmp OMPLIBS = -L/usr/local/lib -lgomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # edit for your MATLAB version location... 
MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.windows_mingw b/make.inc.windows_mingw index e37a1436b..f712bf5fc 100644 --- a/make.inc.windows_mingw +++ b/make.inc.windows_mingw @@ -1,22 +1,9 @@ MINGW=ON # libm not available on Windows? Has to be removed from LIBS to build MATLAB mex file. Does not interfere with library build LIBS= -# please set these paths -FFTW_H_DIR= -FFTW_LIB_DIR= # might be needed for MATLAB LGOMP_DIR= -# modify FLAGS such that FFTW headers are included -ifneq ($(FFTW_H_DIR),) -CFLAGS+=-I$(FFTW_H_DIR) -CXXFLAGS+=-I$(FFTW_H_DIR) -endif -# add FFTW DLL location to LIBS -ifneq ($(FFTW_LIB_DIR),) -LIBS+=-L$(FFTW_LIB_DIR) -endif - # adjust MATLAB flags, add path of lgomp ifneq ($(FFTW_H_DIR),) MFLAGS=-I$(FFTW_H_DIR) -largeArrayDims diff --git a/makefile b/makefile index 96a7a596a..31256b911 100644 --- a/makefile +++ b/makefile @@ -18,7 +18,7 @@ CXX = g++ CC = gcc FC = gfortran -CLINK = -lstdc++ +CLINK = -lstdc++ -lm FLINK = $(CLINK) # Python version: we use python3 by default, but you may need to change... PYTHON = python3 @@ -29,11 +29,7 @@ PYTHON = python3 CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range $(CFLAGS) FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) -# FFTW base name, and math linking... -FFTWNAME = fftw3 -# linux default is fftw3_omp, since 10% faster than fftw3_threads... -FFTWOMPSUFFIX = omp -LIBS := -lm +LIBS := # multithreading for GCC: C++/C/Fortran, MATLAB, and octave (ICC differs)... OMPFLAGS = -fopenmp OMPLIBS = -lgomp @@ -59,15 +55,10 @@ FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) # Now come flags that should be added, whatever user overrode in make.inc. # -fPIC (position-indep code) needed to build dyn lib (.so) # Also, we force return (via :=) to the land of simply-expanded variables... -INCL = -Iinclude -CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++14 +INCL = -Iinclude -Icontrib +CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++17 CFLAGS := $(CFLAGS) $(INCL) -fPIC -# here /usr/include needed for fftw3.f "fortran header"... (JiriK: no longer) -FFLAGS := $(FFLAGS) $(INCL) -I/usr/include -fPIC - -# single-thread total list of math and FFTW libs (now both precisions)... -# (Note: finufft tests use LIBSFFT; spread & util tests only need LIBS) -LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)f $(LIBS) +FFLAGS := $(FFLAGS) $(INCL) -fPIC # multi-threaded libs & flags, and req'd flags (OO for new interface)... ifneq ($(OMP),OFF) @@ -77,8 +68,6 @@ ifneq ($(OMP),OFF) MFLAGS += $(MOMPFLAGS) -DR2008OO OFLAGS += $(OOMPFLAGS) -DR2008OO LIBS += $(OMPLIBS) -# omp override for total list of math and FFTW libs (now both precisions)... - LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)_$(FFTWOMPSUFFIX) -l$(FFTWNAME)f -l$(FFTWNAME)f_$(FFTWOMPSUFFIX) $(LIBS) endif # name & location of library we're building... @@ -108,7 +97,7 @@ OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o # their single-prec versions OBJSF = $(OBJS:%.o=%_32.o) # precision-dependent library object files (compiled & linked only once)... 
-OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o contrib/ducc0/infra/string_utils.o contrib/ducc0/infra/threading.o # all lib dual-precision objs OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) @@ -129,7 +118,7 @@ usage: @echo " make octave - compile and test octave interfaces" @echo " make python - compile and test python interfaces" @echo " make all - do all the above (around 1 minute; assumes you have MATLAB, etc)" - @echo " make spreadtest - compile & run spreader-only tests (no FFTW)" + @echo " make spreadtest - compile & run spreader-only tests (no FFT)" @echo " make spreadtestall - small set spreader-only tests for CI use" @echo " make objclean - remove all object files, preserving libs & MEX" @echo " make clean - also remove all lib, MEX, py, and demo executables" @@ -175,14 +164,14 @@ endif $(DYNLIB): $(OBJSD) # using *absolute* path in the -o here is needed to make portable executables # when compiled against it, in mac OSX, strangely... - $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBSFFT) + $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBS) ifeq ($(OMP),OFF) @echo "$(DYNLIB) built, single-thread version" else @echo "$(DYNLIB) built, multithreaded version" endif -# here $(OMPFLAGS) and $(LIBSFFT) is even needed for linking under mac osx. +# here $(OMPFLAGS) is even needed for linking under mac osx. # see: http://www.cprogramming.com/tutorial/shared-libraries-linux-gcc.html # Also note -l libs come after objects, as per modern GCC requirement. @@ -210,19 +199,19 @@ endif examples/%: examples/%.o $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) -o $@ examples/%c: examples/%c.o $(DYNLIB) - $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) $(CLINK) -o $@ + $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(CLINK) -o $@ examples/%cf: examples/%cf.o $(DYNLIB) - $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) $(CLINK) -o $@ + $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(CLINK) -o $@ # test (library validation) -------------------------------------------------- # build (skipping .o) but don't run. Run with 'test' target # Note: both precisions use same sources; single-prec executables get f suffix. -# generic tests link against our .so... (other libs needed for fftw_forget...) +# generic tests link against our .so... test/%: test/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBS) -o $@ test/%f: test/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBS) -o $@ # low-level tests that are cleaner if depend on only specific objects... test/testutils: test/testutils.cpp src/utils.o src/utils_precindep.o $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o src/utils_precindep.o $(LIBS) -o test/testutils @@ -263,9 +252,9 @@ endif # perftest (performance/developer tests) ------------------------------------- # generic perf test rules... 
perftest/%: perftest/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBS) -o $@ perftest/%f: perftest/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBS) -o $@ # spreader only test, double/single (good for self-contained work on spreader) ST=perftest/spreadtestnd @@ -310,7 +299,7 @@ gurutime: $(GTT) $(GTTF) # This was for a CCQ application... (zgemm was 10x faster! double-prec only) perftest/manysmallprobs: perftest/manysmallprobs.cpp $(STATICLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(STATICLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(STATICLIB) -o $@ @echo "manysmallprobs: single-thread..." OMP_NUM_THREADS=1 $@ @@ -349,11 +338,11 @@ fortran: $(FE) # matlab ---------------------------------------------------------------------- # matlab .mex* executable... (matlab is so slow to start, not worth testing it) matlab: matlab/finufft.cpp $(STATICLIB) - $(MEX) $< $(STATICLIB) $(INCL) $(MFLAGS) $(LIBSFFT) -output matlab/finufft + $(MEX) $< $(STATICLIB) $(INCL) $(MFLAGS) -output matlab/finufft # octave .mex executable... octave: matlab/finufft.cpp $(STATICLIB) - (cd matlab; $(MKOCTFILE) --mex finufft.cpp -I../include ../$(STATICLIB) $(OFLAGS) $(LIBSFFT) -output finufft) + (cd matlab; $(MKOCTFILE) --mex finufft.cpp -I../include ../$(STATICLIB) $(OFLAGS) -output finufft) @echo "Running octave interface tests; please wait a few seconds..." (cd matlab ;\ $(OCTAVE) test/check_finufft.m ;\ @@ -374,7 +363,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install -e ./python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages -e ./python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py @@ -435,7 +424,7 @@ endif objclean: ifneq ($(MINGW),ON) # non-Windows-WSL... - rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o + rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o contrib/ducc0/infra/*.o rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o finufft_mod.mod else # Windows-WSL... diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp index 9524b4dda..f7a271c6a 100644 --- a/perftest/guru_timing_test.cpp +++ b/perftest/guru_timing_test.cpp @@ -134,9 +134,6 @@ int main(int argc, char* argv[]) } // Andrea found the following are needed to get reliable independent timings: - FFTW_CLEANUP(); - FFTW_CLEANUP_THREADS(); - FFTW_FORGET_WISDOM(); //std::this_thread::sleep_for(std::chrono::seconds(1)); sleep(tsleep); @@ -187,14 +184,6 @@ int main(int argc, char* argv[]) // Comparing timing results with repeated calls to corresponding finufft function... - // The following would normally be done between independent timings, as found - // by Andrea Malleo, but in this case we need to access the plan later - // for many_simple_calls() to work, so we cannot do FFTW cleanup without - // apparently causing segfault :(. So we skip them. 
- //FFTW_CLEANUP(); - //FFTW_CLEANUP_THREADS(); - //FFTW_FORGET_WISDOM(); - //std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed sleep(tsleep); //sleep for one second using linux sleep call diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index c6776cf0e..1cf578cf8 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -16,7 +16,7 @@ int main(int argc, char* argv[]) for Xi Chen question. Updated to also demo guru interface and compare speed. 6/7/22 made deterministic changes so check answer matches both ways. - g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 -lfftw3_omp -lm + g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs # multithreaded is much slower, due to overhead of starting threads?... export OMP_NUM_THREADS=1 time ./manysmallprobs diff --git a/perftest/timingBreakdowns.py b/perftest/timingBreakdowns.py index 3abf92e17..33f6fe12c 100644 --- a/perftest/timingBreakdowns.py +++ b/perftest/timingBreakdowns.py @@ -123,7 +123,7 @@ totalTimeT3_Old.append(totalOldTime) #total time speedup - totalSpeedup = round(totalOldTime/totalNewTime,5) + totalSpeedup = round(totalOldTime/max(1e-19,totalNewTime),5) if(ftype == 1): totalTimeT1Ratio.append(totalSpeedup) @@ -201,7 +201,7 @@ totalOldfftwPlan = round(totalOldfftwPlan,5) #These plan ratios include the initial old implementation plan construction!! - fftwPlanRatio = round(totalOldfftwPlan/new_fftwPlan,5) + fftwPlanRatio = round(totalOldfftwPlan/max(1e-19,new_fftwPlan),5) if(ftype == 1): fftwPlanT1Ratio.append(fftwPlanRatio) @@ -228,7 +228,7 @@ #collect the fftw_exec timings for each trial of old totalOldfft = stm.sumAllTime("(.*fft \(\d+ threads\))(.*)",strOut) - fftRatio = round(totalOldfft/new_fft,5) + fftRatio = round(totalOldfft/max(1e-19,new_fft),5) if(ftype == 1): fftT1Ratio.append(fftRatio) @@ -302,11 +302,11 @@ ax1 = fig.add_subplot(221,projection='3d') if(totalTimeT1Ratio): - ax1.bar3d(t1x, t1y, zbot, widths, depths, totalTimeT1Ratio, shade=True, color='r', label='type1', alpha='1') + ax1.bar3d(t1x, t1y, zbot, widths, depths, totalTimeT1Ratio, shade=True, color='r', label='type1', alpha=1) if(totalTimeT2Ratio): - ax1.bar3d(t2x, t2y, zbot, widths, depths, totalTimeT2Ratio, shade=True, color='b', label='type2', alpha='1') + ax1.bar3d(t2x, t2y, zbot, widths, depths, totalTimeT2Ratio, shade=True, color='b', label='type2', alpha=1) if(totalTimeT3Ratio): - ax1.bar3d(t3x, t3y, zbot, widths, depths, totalTimeT3Ratio, shade=True, color='g', label='type3', alpha='1') + ax1.bar3d(t3x, t3y, zbot, widths, depths, totalTimeT3Ratio, shade=True, color='g', label='type3', alpha=1) ax1.legend([t1_proxy,t2_proxy,t3_proxy], ['type1','type2','type3']) diff --git a/src/finufft.cpp b/src/finufft.cpp index 696500b76..57d864b98 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,6 +16,8 @@ #include #include #include "../contrib/legendre_rule_fast.h" +#include "ducc0/fft/fft.h" +#include "ducc0/fft/fftnd_impl.h" using namespace std; using namespace finufft; @@ -45,8 +46,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: 3) deconvolve by division of each Fourier mode independently by the kernel Fourier series coeffs (not merely FFT of kernel), shuffle to output. The kernel coeffs are precomputed in what is called step 0 in the code. - Written with FFTW style complex arrays. 
Step 3a internally uses CPX, - and Step 3b internally uses real arithmetic and FFTW style complex. TYPE 2: The type 2 algorithm proceeds in three main steps: @@ -54,8 +53,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: 2) compute inverse FFT on uniform fine grid 3) spread (dir=2, ie interpolate) data to regular mesh The kernel coeffs are precomputed in what is called step 0 in the code. - Written with FFTW style complex arrays. Step 0 internally uses CPX, - and Step 1 internally uses real arithmetic and FFTW style complex. TYPE 3: The type 3 algorithm is basically a type 2 (which is implemented precisely @@ -69,7 +66,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: using quadrature of the kernel function times exponentials. iii) Shifts in x (real) and s (Fourier) are done to minimize the interval half-widths X and S, hence nf1. - No references to FFTW are needed here. CPX arithmetic is used. MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so @@ -83,7 +79,7 @@ Design notes for guru interface implementation: since that would only survive in the scope of each function. * Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). + state, */ @@ -208,7 +204,7 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD], w[2*MAX_NQUAD]; legendre_compute_glr(2*q,z,w); // only half the nodes used, eg on (0,1) - std::complex a[MAX_NQUAD]; + CPX a[MAX_NQUAD]; for (int n=0;n aj[MAX_NQUAD]; // phase rotator for this thread + CPX aj[MAX_NQUAD]; // phase rotator for this thread for (int n=0;nfwBatch, using the same set of + to (or from) the batch of fine working grids fwBatch, using the same set of (index-sorted) NU points p->X,Y,Z for each vector in the batch. The direction (spread vs interpolate) is set by p->spopts.spread_direction. Returns 0 (no error reporting for now). @@ -435,7 +428,7 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) #endif #pragma omp parallel for num_threads(nthr_outer) for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace + CPX *fwi = fwBatch + i*p->nf; // start of i'th fw array in wkspace CPX *ci = cBatch + i*p->nj; // start of i'th c array in cBatch spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT*)fwi, p->nj, p->X, p->Y, p->Z, (FLT*)ci, p->spopts, p->didSort); @@ -443,12 +436,12 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) return 0; } -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) +int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX * fwBatch, CPX* fkBatch) /* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + Type 1: deconvolves (amplifies) from each interior fw array in fwBatch into each output array fk in fkBatch. Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. + again looping over fk in fkBatch and fw in fwBatch. The direction (spread vs interpolate) is set by p->spopts.spread_direction. This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize times. @@ -458,7 +451,7 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
#pragma omp parallel for num_threads(batchSize) for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace + CPX *fwi = fwBatch + i*p->nf; // start of i'th fw array in wkspace CPX *fki = fkBatch + i*p->N; // start of i'th fk array in fkBatch // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... @@ -482,14 +475,14 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) // since this func is local only, we macro its name here... #ifdef SINGLE -#define GRIDSIZE_FOR_FFTW gridsize_for_fftwf +#define GRIDSIZE_FOR_FFT gridsize_for_fftf #else -#define GRIDSIZE_FOR_FFTW gridsize_for_fftw +#define GRIDSIZE_FOR_FFT gridsize_for_fft #endif -int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ +int* GRIDSIZE_FOR_FFT(FINUFFT_PLAN p){ // local helper func returns a new int array of length dim, extracted from -// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. +// the finufft plan, that is needed for calling FFTs. int* nf; if(p->dim == 1){ nf = new int[1]; @@ -499,7 +492,7 @@ int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ nf = new int[2]; nf[0] = (int)p->nf2; nf[1] = (int)p->nf1; - } // fftw enforced row major ordering, ie dims are backwards ordered + } // use row major ordering, ie dims are backwards ordered else{ nf = new int[3]; nf[0] = (int)p->nf3; @@ -537,7 +530,7 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->showwarn = 1; o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; + o->fftw = 0; // FIXME: unused o->spread_sort = 2; o->spread_kerevalmeth = 1; o->spread_kerpad = 1; @@ -555,8 +548,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // Populates the fields of finufft_plan which is pointed to by "p". // opts is ptr to a finufft_opts to set options, or NULL to use defaults. // For some of the fields, if "auto" selected, choose the actual setting. -// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan +// For types 1,2 allocates memory for internal working arrays, and +// evaluates spreading kernel coefficients { FINUFFT_PLAN p; cout << scientific << setprecision(15); // for commented-out low-lev debug @@ -646,20 +639,6 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // ------------------------ types 1,2: planning needed --------------------- if (type==1 || type==2) { - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT - // thread-safe (can be called inside OMP) - { - static bool did_fftw_init = false; // the only global state of FINUFFT - std::lock_guard lock(fftw_lock); - if (!did_fftw_init) { - FFTW_INIT(); // setup FFTW global state; should only do once - FFTW_PLAN_TH(nthr_fft); // ditto - did_fftw_init = true; // ensure other FINUFFT threads don't clash - } - } - p->spopts.spread_direction = type; if (p->opts.showwarn) { // user warn round-off error... 
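The initialization block deleted in the hunk above was the library's only global state; with it gone, plan creation needs no lock. An illustrative sketch of the now-safe pattern (not from the patch; examples/threadsafe1d1.cpp is the shipped demo):

    #include <complex>
    #include <cstdint>
    #include <thread>
    #include "finufft.h"

    // each call plans, executes, and destroys its own transform; with the
    // FFTW planner lock gone this needs no external synchronization
    void one_1d1(int64_t M, int64_t N, double* x,
                 std::complex<double>* c, std::complex<double>* F) {
      finufft_opts opts;
      finufft_default_opts(&opts);
      opts.nthreads = 1;                            // single-threaded worker
      finufft1d1(M, x, c, +1, 1e-9, N, F, &opts);   // type 1, tolerance 1e-9
    }
    // ...launch several one_1d1() calls on std::thread objects and join them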
@@ -704,44 +683,20 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, if (dim>2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); if (p->opts.debug) printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n",__func__,p->spopts.nspread, timer.elapsedsec()); - timer.restart(); p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); return FINUFFT_ERR_MAXNALLOC; } - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__); - free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags - { - std::lock_guard lock(fftw_lock); - p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, - p->fftSign, p->opts.fftw); - } - if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec()); - delete []ns; - } else { // -------------------------- type 3 (no planning) ------------ if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n",__func__,dim,type,ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; p->Sp = NULL; p->Tp = NULL; p->Up = NULL; p->prephase = NULL; p->deconv = NULL; p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! } return ier; // report setup_spreader status (could be warning) } @@ -826,19 +781,6 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); return FINUFFT_ERR_MAXNALLOC; } - if (p->fwBatch) - FFTW_FR(p->fwBatch); - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if(p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize); // batch c' work - if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__); - return FINUFFT_ERR_ALLOC; - } - //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... if(p->X) free(p->X); @@ -988,13 +930,14 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){ existing (sorted) NU pts and existing plan. For type 1 and 3: cj is input, fk is output. For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate + Performs spread/interp, pre/post deconvolve, and FFTs as appropriate for each of the 3 types. For cases of ntrans>1, performs work in blocks of size up to batchSize. 
@@ -988,13 +930,14 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
    existing (sorted) NU pts and existing plan.
    For type 1 and 3: cj is input, fk is output. For type 2: fk is input, cj is output.
-   Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate
+   Performs spread/interp, pre/post deconvolve, and FFTs as appropriate
    for each of the 3 types.
    For cases of ntrans>1, performs work in blocks of size up to batchSize.
    Return value 0 (no error diagnosis yet).
    Barnett 5/20/20, based on Malleo 2019.
 */
   CNTime timer; timer.start();
+  std::vector<CPX> fwBatch(p->nf * p->batchSize);  // the big workspace
 
   if (p->type!=3){ // --------------------- TYPE 1,2 EXEC ------------------
 
@@ -1014,27 +957,110 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
       // STEP 1: (varies by type)
       timer.restart();
       if (p->type == 1) {  // type 1: spread NU pts p->X, weights cj, to fw grid
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
       } else {          //  type 2: amplify Fourier coeffs fk into 0-padded fw
-        deconvolveBatch(thisBatchSize, p, fkb);
+        deconvolveBatch(thisBatchSize, p, fwBatch.data(), fkb);
         t_deconv += timer.elapsedsec();
       }
 
-      // STEP 2: call the pre-planned FFT on this batch
+      // STEP 2: call the FFT on this batch
       timer.restart();
-      FFTW_EX(p->fftwPlan);   // if thisBatchSize<batchSize it wastes some flops
+      {
+        int *ns = GRIDSIZE_FOR_FFTW(p);
+        std::vector<size_t> arrdims, axes;
+        arrdims.push_back(size_t(p->batchSize));
+        arrdims.push_back(size_t(ns[0])); axes.push_back(1);
+        if (p->dim>=2) { arrdims.push_back(size_t(ns[1])); axes.push_back(2); }
+        if (p->dim>=3) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); }
+        ducc0::vfmav<CPX> data(fwBatch.data(), arrdims);
+        if (p->dim==1)      // 1D: no chance for FFT shortcuts
+          ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+        else if (p->dim==2) // 2D: do partial FFTs
+        {
+          if (p->ms<2)      // something is weird, do standard FFT
+            ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+          else
+          {
+            size_t y_lo = size_t((p->ms+1)/2);
+            size_t y_hi = size_t(ns[1]-p->ms/2);
+            auto sub1 = ducc0::subarray(data, {{},{},{0,y_lo}});
+            auto sub2 = ducc0::subarray(data, {{},{},{y_hi,ducc0::MAXIDX}});
+            if (p->type == 1) // spreading, not all parts of the output array are needed
+            {
+              // do axis 2 in full
+              ducc0::c2c(data, data, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 1
+              ducc0::c2c(sub1, sub1, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+            else              // interpolation, parts of the input array are zero
+            {
+              // do only parts of axis 1
+              ducc0::c2c(sub1, sub1, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do axis 2 in full
+              ducc0::c2c(data, data, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+          }
+        }
+        else                // 3D
+        {
+          if ((p->ms<2) || (p->mt<2)) // something is weird, do standard FFT
+            ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+          else
+          {
+            size_t z_lo = size_t((p->ms+1)/2);
+            size_t z_hi = size_t(ns[2]-p->ms/2);
+            size_t y_lo = size_t((p->mt+1)/2);
+            size_t y_hi = size_t(ns[1]-p->mt/2);
+            auto sub1 = ducc0::subarray(data, {{},{},{},{0,z_lo}});
+            auto sub2 = ducc0::subarray(data, {{},{},{},{z_hi,ducc0::MAXIDX}});
+            auto sub3 = ducc0::subarray(sub1, {{},{},{0,y_lo},{}});
+            auto sub4 = ducc0::subarray(sub1, {{},{},{y_hi, ducc0::MAXIDX},{}});
+            auto sub5 = ducc0::subarray(sub2, {{},{},{0,y_lo},{}});
+            auto sub6 = ducc0::subarray(sub2, {{},{},{y_hi, ducc0::MAXIDX},{}});
+            if (p->type == 1) // spreading, not all parts of the output array are needed
+            {
+              // do axis 3 in full
+              ducc0::c2c(data, data, {3}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 2
+              ducc0::c2c(sub1, sub1, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do even smaller parts of axis 1
+              ducc0::c2c(sub3, sub3, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub4, sub4, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub5, sub5, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub6, sub6, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+            else              // interpolation, parts of the input array are zero
+            {
+              // do even smaller parts of axis 1
+              ducc0::c2c(sub3, sub3, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub4, sub4, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub5, sub5, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub6, sub6, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 2
+              ducc0::c2c(sub1, sub1, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do axis 3 in full
+              ducc0::c2c(data, data, {3}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+          }
+        }
+        delete[] ns;
+      }
       t_fft += timer.elapsedsec();
       if (p->opts.debug>1)
-        printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec());
+        printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
 
       // STEP 3: (varies by type)
       timer.restart();
       if (p->type == 1) {  // type 1: deconvolve (amplify) fw and shuffle to fk
-        deconvolveBatch(thisBatchSize, p, fkb);
+        deconvolveBatch(thisBatchSize, p, fwBatch.data(), fkb);
         t_deconv += timer.elapsedsec();
       } else {          // type 2: interpolate unif fw grid to NU target pts
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
       }
     }                                                  // ........end b loop
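The partial transforms above lean on FFT mode ordering: of the nf fine-grid points along an axis holding ms retained modes, deconvolution reads only those ms, which sit at the first (ms+1)/2 and last ms/2 indices. For type 1 the contiguous axis is therefore transformed in full and the outer axes only over those kept ranges; for type 2 the order flips, because on input only the kept ranges are nonzero. A self-contained 2D sketch of the same trick using documented ducc0 calls (plain n1 x n2 grid; the function and its arguments are mine, not the library's):

    #include <complex>
    #include <vector>
    #include "ducc0/fft/fft.h"

    using cplx = std::complex<double>;

    // Forward-FFT an n1 x n2 grid in place, skipping axis-0 work on columns
    // that a type-1 deconvolve never reads; assumes ms < n2 (upsampled grid).
    void fft2_type1_style(std::vector<cplx>& grid, size_t n1, size_t n2,
                          size_t ms, size_t nthreads) {
      ducc0::vfmav<cplx> data(grid.data(), {n1, n2});
      ducc0::c2c(data, data, {1}, true, 1.0, nthreads);  // contiguous axis in full
      auto lo = ducc0::subarray(data, {{}, {0, (ms+1)/2}});             // kept >=0 freqs
      auto hi = ducc0::subarray(data, {{}, {n2-ms/2, ducc0::MAXIDX}});  // kept <0 freqs
      ducc0::c2c(lo, lo, {0}, true, 1.0, nthreads);      // outer axis, kept columns only
      ducc0::c2c(hi, hi, {0}, true, 1.0, nthreads);
    }

With the usual upsampling (nf about 2*ms) this skips roughly half of the outer-axis transforms; the 3D branch above nests the same idea once more via sub3..sub6.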
@@ -1060,6 +1086,8 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
     if (p->opts.debug)
       printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n",__func__,p->ntrans, p->nbatch, p->batchSize);
 
+    std::vector<CPX> CpBatch(p->nj*p->batchSize);  // batch c' work
+
     for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches
 
       // batching and pointers to this batch, identical to t1,2 above...
@@ -1075,25 +1103,25 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
       for (int i=0; i<thisBatchSize; i++) {
         BIGINT ioff = i*p->nj;
         for (BIGINT j=0;j<p->nj;++j)
-          p->CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j];
+          CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j];
       }
       t_pre += timer.elapsedsec();
 
       // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
       timer.restart();
       p->spopts.spread_direction = 1;                         // spread
-      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch);  // p->X are primed
+      spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), CpBatch.data());  // p->X are primed
       t_spr += timer.elapsedsec();
-      //for (int j=0;j<p->nf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]);  // debug
+      //for (int j=0;j<p->nf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,fwBatch[j].real(),fwBatch[j].imag());  // debug
 
       // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
       timer.restart();
       // illegal possible shrink of ntrans *after* plan for smaller last batch:
       p->innerT2plan->ntrans = thisBatchSize;   // do not try this at home!
-      /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array
+      /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array
          still the same size, as Andrea explained; just wastes a few flops) */
-      FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX*)(p->fwBatch));
+      FINUFFT_EXECUTE(p->innerT2plan, fkb, fwBatch.data());
       t_t2 += timer.elapsedsec();
 
       // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
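Worth noting about the two hunks above: fwBatch and CpBatch are no longer plan members but locals of FINUFFT_EXECUTE, sized per call and released by the vector destructor on every exit path, so the destroy code below loses the matching frees. A stripped-down sketch of the pattern (names hypothetical; the real batch loop is above):

    #include <complex>
    #include <vector>

    // Per-call scratch: the big workspace no longer lives in the plan, which
    // removes one class of shared-state writes from execute() (type 3 still
    // pokes p->spopts.spread_direction and innerT2plan->ntrans, as seen above).
    template<typename FLT>
    int execute_batches(size_t nf, size_t batchSize, size_t nbatch) {
      std::vector<std::complex<FLT>> fwBatch(nf * batchSize);  // value-initialized
      for (size_t b = 0; b < nbatch; ++b) {
        // ... spread / FFT / deconvolve one batch through fwBatch.data() ...
      }
      return 0;            // fwBatch freed here and on any early return
    }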
@@ -1115,7 +1143,7 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
     }
   }
   //for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long int)k,(double)real(fk[k]),(double)imag(fk[k]));  // debug
-  
+
   return 0;
 }
 
@@ -1130,19 +1158,13 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
   if (!p)                // NULL ptr, so not a ptr to a plan, report error
     return 1;
 
-  FFTW_FR(p->fwBatch);   // free the big FFTW (or t3 spread) working array
   free(p->sortIndices);
   if (p->type==1 || p->type==2) {
-    {
-      std::lock_guard<std::mutex> lock(fftw_lock);
-      FFTW_DE(p->fftwPlan);
-    }
     free(p->phiHat1);
     free(p->phiHat2);
     free(p->phiHat3);
   } else {               // free the stuff alloc for type 3 only
     FINUFFT_DESTROY(p->innerT2plan);   // if NULL, ignore its error code
-    free(p->CpBatch);
     free(p->Sp); free(p->Tp); free(p->Up);
     free(p->X); free(p->Y); free(p->Z);
     free(p->prephase);
diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp
index ba3a3f328..830fa9011 100644
--- a/test/dumbinputs.cpp
+++ b/test/dumbinputs.cpp
@@ -23,11 +23,11 @@
    Made pass-fail, obviating results/dumbinputs.refout. Barnett 6/16/23.
 
    Suggested compile:
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lfftw3_omp -lm
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE
+   g++ -std=c++17 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs
+   g++ -std=c++17 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -DSINGLE
 
    or if you have built a single-core version:
-   g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm
+   g++ -std=c++17 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs
 
    etc
 */
diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp
index 8dd345b1a..90a0aaaba 100644
--- a/test/finufft1d_test.cpp
+++ b/test/finufft1d_test.cpp
@@ -19,7 +19,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);  // put defaults in opts
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<3 || argc>8) {
     for (int i=0; help[i]; ++i)
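The test deletions that follow are two recurring edits per file: the opts.fftw field is gone, and FFTW_FORGET_WISDOM() no longer exists. The latter kept the many-vs-single timing comparisons fair: FFTW accumulates planner wisdom globally, so without a reset the repeated-single reference runs would reuse plans measured during the "many" run. A sketch of what such a reset amounts to in plain FFTW (clearing both precisions together is this sketch's assumption):

    #include <fftw3.h>

    // Drop all accumulated planner wisdom so the next fftw_plan_* call
    // cannot reuse measurements from earlier plans of the same shape.
    void forget_all_wisdom() {
      fftw_forget_wisdom();    // double-precision planner state
      fftwf_forget_wisdom();   // single-precision planner state
    }

ducc0 exposes no planner cache to the caller, so there is nothing left for the tests to reset.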
diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp
index 581c52c2d..b2cff678a 100644
--- a/test/finufft1dmany_test.cpp
+++ b/test/finufft1dmany_test.cpp
@@ -20,7 +20,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<4 || argc>11) {
     for (int i=0; help[i]; ++i)
@@ -81,7 +80,6 @@ int main(int argc, char* argv[])
            (long long)nt1,i,err);
 
   // compare the result with FINUFFT1D1
-  FFTW_FORGET_WISDOM();
   CPX * F_1d1 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
   CPX * Fstart;
   CPX * cstart;
@@ -112,7 +110,6 @@ int main(int argc, char* argv[])
 
   printf("test 1d2 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
 
   #pragma omp parallel
   {
@@ -141,7 +138,6 @@ int main(int argc, char* argv[])
     printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err);
 
   // check against single calls to FINUFFT1D2...
-  FFTW_FORGET_WISDOM();
   CPX * c_1d2 = (CPX *)malloc(sizeof(CPX)*M*ntransf);
   timer.restart();
   for(BIGINT j = 0; j < ntransf; j++){
@@ -165,7 +161,6 @@ int main(int argc, char* argv[])
   free(c_1d2);
 
   printf("test 1d3 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
 
   #pragma omp parallel
   {
@@ -206,7 +201,6 @@ int main(int argc, char* argv[])
     printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n",(long long)kt,i,err);
 
   // compare the result with single calls to FINUFFT1D3...
-  FFTW_FORGET_WISDOM();
   CPX *f_1d3 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
   timer.restart();
   for(int k = 0; k < ntransf; k++){
diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp
index 04945b5f9..f2e02aaa6 100644
--- a/test/finufft2d_test.cpp
+++ b/test/finufft2d_test.cpp
@@ -19,7 +19,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<4 || argc>9) {
     for (int i=0; help[i]; ++i)
diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp
index 31b65378e..e79a8613c 100644
--- a/test/finufft2dmany_test.cpp
+++ b/test/finufft2dmany_test.cpp
@@ -20,7 +20,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  //opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<5 || argc>12) {
     for (int i=0; help[i]; ++i)
@@ -85,7 +84,6 @@ int main(int argc, char* argv[])
            (long long)nt1,(long long)nt2,i,err);
 
   // compare the result with FINUFFT2D1
-  FFTW_FORGET_WISDOM();
   finufft_opts simpleopts = opts;
   simpleopts.debug = 0;        // don't output timing for calls of FINUFFT2D1
   simpleopts.spread_debug = 0;
@@ -125,7 +123,6 @@ int main(int argc, char* argv[])
   for (BIGINT m=0; m<N; ++m)
[...]
-  FFTW_FORGET_WISDOM();
diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp
[...]
-  //opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<6 || argc>13) {
     for (int i=0; help[i]; ++i)
@@ -89,7 +88,6 @@ int main(int argc, char* argv[])
            (long long)nt1,(long long)nt2,(long long)nt3,i,err);
 
   // compare the result with FINUFFT3D1
-  FFTW_FORGET_WISDOM();
   finufft_opts simpleopts=opts;
   simpleopts.debug = 0;        // don't output timing for calls of FINUFFT3D1
   simpleopts.spread_debug = 0;
@@ -128,7 +126,6 @@ int main(int argc, char* argv[])
 #pragma omp for schedule(static,TEST_RANDCHUNK)
   for (BIGINT m=0; m