diff --git a/.github/workflows/C++.yml b/.github/workflows/C++.yml index 31d8b203b..f65200b6b 100644 --- a/.github/workflows/C++.yml +++ b/.github/workflows/C++.yml @@ -18,10 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install fftw - run: | - yum install -y fftw3-devel - - name: Compile C++ code run: | make spreadtestall @@ -36,9 +32,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install omp and fftw + - name: Install omp run: | - brew install libomp fftw + brew install libomp cp make.inc.macosx_clang make.inc - name: Compile C++ code @@ -55,9 +51,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install gcc and fftw + - name: Install gcc run: | - brew install gcc@10 fftw + brew install gcc@10 cp make.inc.macosx_gcc-10 make.inc - name: Compile C++ code @@ -86,7 +82,6 @@ jobs: diffutils pacboy: >- toolchain:p - fftw:p - name: Compile C++ code run: | cp make.inc.windows_msys make.inc diff --git a/.github/workflows/python_build_win.ps1 b/.github/workflows/python_build_win.ps1 index 072413d42..5b4444c4d 100644 --- a/.github/workflows/python_build_win.ps1 +++ b/.github/workflows/python_build_win.ps1 @@ -39,10 +39,6 @@ Copy-Item -Path C:\msys64\mingw64\bin\libstdc++-*.dll -Destination ([IO.Path]::C Copy-Item -Path C:\msys64\mingw64\bin\libgcc_s_seh-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libgomp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libwinpthread-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) -Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) New-Item -Path .\wheelhouse -ItemType Directory -Force wheel.exe pack $unpacked_wheel -d .\wheelhouse if (-not $?) 
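<# With the FFTW DLLs no longer bundled, the wheel only needs the toolchain
   runtime DLLs copied above. A minimal native smoke test for the now
   self-contained build could look like this (hypothetical smoke_test.cpp;
   finufft1d1 is the library's public type-1 interface, the driver values
   are assumptions):

     #include <complex>
     #include <vector>
     #include "finufft.h"

     int main() {
       const int64_t M = 1000, N = 256;           // nonuniform points, modes
       std::vector<double> x(M, 0.5);             // sample coordinates
       std::vector<std::complex<double>> c(M, {1.0, 0.0}), f(N);
       // type-1 NUFFT, +i sign, 1e-9 tolerance, default options
       return finufft1d1(M, x.data(), c.data(), +1, 1e-9, N, f.data(), nullptr);
     }
#>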
{throw "Failed pack wheel"} diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index fd0db91bb..102d33c4b 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -18,10 +18,6 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install fftw - run: | - yum install -y fftw3-devel - - name: Install ffi run: | yum install -y libffi-devel @@ -44,9 +40,9 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Install gcc and fftw + - name: Install gcc run: | - brew install gcc fftw + brew install gcc cp make.inc.macosx_gcc-8 make.inc echo "FC=gfortran-11" >> make.inc echo "CC=gcc-11" >> make.inc @@ -169,7 +165,7 @@ jobs: - uses: actions/checkout@v2 - name: Install GCC and make - run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain mingw-w64-x86_64-fftw" + run: C:\msys64\usr\bin\bash.exe -lc "pacman -Sy --noconfirm make mingw-w64-x86_64-toolchain" - name: Build and Test Python 3.8 uses: actions/setup-python@v2 diff --git a/.travis.yml b/.travis.yml index cd0187140..9a15984f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -12,7 +12,6 @@ addons: homebrew: packages: - ccache - - fftw - libomp update: true cache: ccache diff --git a/CMakeLists.txt b/CMakeLists.txt index 9e3ee23a7..a917d4f4b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.19) project(finufft VERSION 2.2.0 LANGUAGES C CXX) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) + set(GNU_LIKE_FRONTENDS AppleClang Clang GNU) if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS) # Set custom compiler flags for gcc-compatible compilers @@ -27,7 +30,7 @@ option(FINUFFT_BUILD_TESTS "Whether to build the FINUFFT tests" OFF) option(FINUFFT_BUILD_FORTRAN "Whether to build the FINUFFT Fortran examples" OFF) option(FINUFFT_BUILD_MATLAB "Whether to build the FINUFFT Matlab interface" OFF) option(FINUFFT_ENABLE_SANITIZERS "Whether to enable sanitizers, only effective for Debug configuration." ON) -option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded. This does not affect the choice of FFTW library." ON) +option(FINUFFT_USE_OPENMP "Whether to use OpenMP for parallelization. If disabled, the finufft library will be single threaded." ON) option(FINUFFT_USE_CUDA "Whether to build CUDA accelerated FINUFFT library (libcufinufft). This is completely independent of the main FINUFFT library" OFF) option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinufft)." ON) # sphinx tag (don't remove): @cmake_opts_end @@ -35,9 +38,6 @@ option(FINUFFT_USE_CPU "Whether to build the ordinary FINUFFT library (libfinuff if(FINUFFT_USE_CPU) set(CPM_DOWNLOAD_VERSION 0.38.0) include(cmake/setupCPM.cmake) - - set(FFTW_VERSION 3.3.10) - include(cmake/setupFFTW.cmake) endif() if (FINUFFT_BUILD_MATLAB) @@ -87,7 +87,8 @@ endfunction() # Utility function to set finufft compilation options. 
function(set_finufft_options target) set_property(TARGET ${target} PROPERTY POSITION_INDEPENDENT_CODE ON) - set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 14) + set_property(TARGET ${target} PROPERTY CMAKE_CXX_STANDARD 17) + set_property(TARGET ${target} PROPERTY CMAKE_CUDA_STANDARD 17) enable_asan(${target}) target_compile_options(${target} PRIVATE SHELL:$<$:${FINUFFT_ARCH_FLAGS}>) @@ -96,7 +97,7 @@ function(set_finufft_options target) target_compile_options(${target} PRIVATE $<$:-fcx-limited-range>) endif () - target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") + target_include_directories(${target} PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/contrib") if (FINUFFT_USE_OPENMP) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) # there are issues on windows with OpenMP and CMake, so we need to manually add the flags @@ -111,17 +112,6 @@ function(set_finufft_options target) endif () endif () - # FFTW CMAKE file includes the APIs only as an install target, so we need to manually - # include them since we need them for build not for install - # trying to include them directly into the fftw and fftwf targets causes issues with - # the latest version of cmake, so we do it here instead. - if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - list (GET FINUFFT_FFTW_LIBRARIES 0 element) - get_property(FFTW_SOURCE_DIR TARGET ${element} PROPERTY SOURCE_DIR) - set(FFTW_INCLUDE_DIR ${FFTW_SOURCE_DIR}/api) - target_include_directories(${target} PUBLIC ${FFTW_INCLUDE_DIR}) - endif() - endfunction() if(FINUFFT_USE_CPU) @@ -129,13 +119,13 @@ if(FINUFFT_USE_CPU) add_library(finufft_f32 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) target_compile_definitions(finufft_f32 PRIVATE SINGLE) set_finufft_options(finufft_f32) - target_link_libraries(finufft_f32 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) + target_link_libraries(finufft_f32 PUBLIC) add_library(finufft_f64 OBJECT ${FINUFFT_PRECISION_DEPENDENT_SOURCES}) set_finufft_options(finufft_f64) - target_link_libraries(finufft_f64 PUBLIC ${FINUFFT_FFTW_LIBRARIES}) + target_link_libraries(finufft_f64 PUBLIC) - add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) + add_library(finufft SHARED src/utils_precindep.cpp contrib/legendre_rule_fast.cpp contrib/ducc0/infra/string_utils.cc contrib/ducc0/infra/threading.cc) set_finufft_options(finufft) target_link_libraries(finufft PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it @@ -144,7 +134,7 @@ if(FINUFFT_USE_CPU) endif() target_include_directories(finufft PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include") - add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp) + add_library(finufft_static STATIC src/utils_precindep.cpp contrib/legendre_rule_fast.cpp contrib/ducc0/infra/string_utils.cc contrib/ducc0/infra/threading.cc) set_finufft_options(finufft) target_link_libraries(finufft_static PUBLIC finufft_f32 finufft_f64) # windows does not have a math library, so we need to exclude it @@ -161,6 +151,7 @@ if(FINUFFT_USE_CUDA) set(CMAKE_CUDA_ARCHITECTURES "60;70;75" CACHE STRING "" FORCE) endif() enable_language(CUDA) + set(CMAKE_CUDA_STANDARD 17) find_package(CUDAToolkit REQUIRED) add_subdirectory(src/cuda) if (BUILD_TESTING AND FINUFFT_BUILD_TESTS) diff --git a/CMakePresets.json b/CMakePresets.json index 2363692b1..1cc13a3eb 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -48,10 +48,9 @@ "name": 
"singlethreaded", "binaryDir": "build/singlethreaded", "displayName": "singlethreaded", - "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFTW", + "description": "Configuration for single-threaded build. Disables OpenMP for finufft and FFT", "inherits": "default", "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "", "FINUFFT_USE_OPENMP": "OFF" } }, @@ -89,7 +88,6 @@ "description": "Build with the matlab interface", "generator": "Ninja Multi-Config", "cacheVariables": { - "FINUFFT_FFTW_SUFFIX": "Threads", "FINUFFT_BUILD_MATLAB": "ON", "FINUFFT_ENABLE_SANITIZERS": "OFF" } diff --git a/cmake/setupFFTW.cmake b/cmake/setupFFTW.cmake deleted file mode 100644 index 9a7f8c44d..000000000 --- a/cmake/setupFFTW.cmake +++ /dev/null @@ -1,62 +0,0 @@ -CPMAddPackage( - NAME findfftw - GIT_REPOSITORY "https://github.com/egpbos/findFFTW.git" - GIT_TAG "master" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES -) - -list(APPEND CMAKE_MODULE_PATH "${findfftw_SOURCE_DIR}") - -if (FINUFFT_FFTW_LIBRARIES STREQUAL DEFAULT OR FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD) - find_package(FFTW) - if ( (NOT FFTW_FOUND ) OR (FINUFFT_FFTW_LIBRARIES STREQUAL DOWNLOAD)) - if (FINUFFT_FFTW_SUFFIX STREQUAL THREADS) - set(FINUFFT_USE_THREADS ON) - else() - set(FINUFFT_USE_THREADS OFF) - endif() - CPMAddPackage( - NAME fftw3 - OPTIONS - "ENABLE_AVX2 ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES - ) - - CPMAddPackage( - NAME fftw3f - OPTIONS - "ENABLE_AVX2 ON" - "BUILD_TESTS OFF" - "BUILD_SHARED_LIBS OFF" - "ENABLE_FLOAT ON" - "ENABLE_THREADS ${FINUFFT_USE_THREADS}" - "ENABLE_OPENMP ${FINUFFT_USE_OPENMP}" - URL "http://www.fftw.org/fftw-${FFTW_VERSION}.tar.gz" - URL_HASH "MD5=8ccbf6a5ea78a16dbc3e1306e234cc5c" - EXCLUDE_FROM_ALL YES - GIT_SHALLOW YES - ) - - set(FINUFFT_FFTW_LIBRARIES fftw3 fftw3f) - if (FINUFFT_USE_THREADS) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_threads fftw3f_threads) - elseif (FINUFFT_USE_OPENMP) - list(APPEND FINUFFT_FFTW_LIBRARIES fftw3_omp fftw3f_omp) - endif () - - foreach (element IN LISTS FINUFFT_FFTW_LIBRARIES) - set_property(TARGET ${element} PROPERTY POSITION_INDEPENDENT_CODE ON) - endforeach () - - else () - set(FINUFFT_FFTW_LIBRARIES "FFTW::Float" "FFTW::Double" "FFTW::Float${FINUFFT_FFTW_SUFFIX}" "FFTW::Double${FINUFFT_FFTW_SUFFIX}") - endif () -endif () \ No newline at end of file diff --git a/contrib/ducc0/fft/fft.h b/contrib/ducc0/fft/fft.h new file mode 100644 index 000000000..be270639c --- /dev/null +++ b/contrib/ducc0/fft/fft.h @@ -0,0 +1,982 @@ +/* +This file is part of the ducc FFT library + +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. 
+* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFT_H +#define DUCC0_FFT_H + +#include +#include +#include +#include +#include +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/mav.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/math/unity_roots.h" + +namespace ducc0 { + +namespace detail_fft { + +using namespace std; + +template using Troots = shared_ptr>>; +template inline auto tidx() { return type_index(typeid(T)); } + +template inline void PM(T &a, T &b, T c, T d) + { a=c+d; b=c-d; } +template inline void PMINPLACE(T &a, T &b) + { T t = a; a+=b; b=t-b; } +template inline void MPINPLACE(T &a, T &b) + { T t = a; a-=b; b=t+b; } +template void special_mul (const Cmplx &v1, const Cmplx &v2, Cmplx &res) + { + res = fwd ? 
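+  /* NOTE: special_mul<fwd> multiplies v1 by the twiddle v2 or by its
+     conjugate, so each kernel below gets compiled once per transform
+     direction with the sign baked in. */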
Cmplx(v1.r*v2.r+v1.i*v2.i, v1.i*v2.r-v1.r*v2.i) + : Cmplx(v1.r*v2.r-v1.i*v2.i, v1.r*v2.i+v1.i*v2.r); + } + +struct util1d // hack to avoid duplicate symbols + { + /* returns the smallest composite of 2, 3, 5, 7 and 11 which is >= n */ + DUCC0_NOINLINE static size_t good_size_cmplx(size_t n) + { + if (n<=12) return n; + + size_t bestfac=2*n; + for (size_t f11=1; f11n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + /* returns the smallest composite of 2, 3, 5 which is >= n */ + DUCC0_NOINLINE static size_t good_size_real(size_t n) + { + if (n<=6) return n; + + size_t bestfac=2*n; + for (size_t f5=1; f5n) + { + if (x>=1; + } + else + return n; + } + } + return bestfac; + } + + DUCC0_NOINLINE static vector prime_factors(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&1)==0) + { N>>=1; factors.push_back(2); } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + }; + +// T: "type", f/c: "float/complex", s/v: "scalar/vector" +template class cfftpass + { + public: + virtual ~cfftpass(){} + using Tcs = Cmplx; + + // number of Tcd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + factors.reserve(15); + while ((N&7)==0) + { factors.push_back(8); N>>=3; } + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +template class rfftpass + { + public: + virtual ~rfftpass(){} + + // number of Tfd values required as scratch space during "exec" + // will be provided in "buf" + virtual size_t bufsize() const = 0; + virtual bool needs_copy() const = 0; + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, + bool fwd, size_t nthreads=1) const = 0; + + static vector factorize(size_t N) + { + MR_assert(N>0, "need a positive number"); + vector factors; + while ((N&3)==0) + { factors.push_back(4); N>>=2; } + if ((N&1)==0) + { + N>>=1; + // factor 2 should be at the front of the factor list + factors.push_back(2); + swap(factors[0], factors.back()); + } + for (size_t divisor=3; divisor*divisor<=N; divisor+=2) + while ((N%divisor)==0) + { + factors.push_back(divisor); + N/=divisor; + } + if (N>1) factors.push_back(N); + return factors; + } + + static shared_ptr make_pass(size_t l1, size_t ido, size_t ip, + const Troots &roots, bool vectorize=false); + static shared_ptr make_pass(size_t ip, bool vectorize=false) + { + return make_pass(1,1,ip,make_shared>>(ip), + vectorize); + } + }; + +template using Tcpass = shared_ptr>; +template using Trpass = shared_ptr>; + +template class pocketfft_c + { + 
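+/* Usage sketch for the helpers above (good_size_cmplx/good_size_real find the
+   next length that factors into 2,3,5,7,11 resp. 2,3,5; they are re-exported
+   at the end of this header as ducc0::good_size_complex/good_size_real):
+
+     size_t nc = ducc0::good_size_complex(577);  // 588 = 2*2*3*7*7
+     size_t nr = ducc0::good_size_real(577);     // 600 = 2*2*2*3*5*5
+*/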
private: + size_t N; + size_t critbuf; + Tcpass plan; + + public: + pocketfft_c(size_t n, bool vectorize=false) + : N(n), critbuf(((N&1023)==0) ? 16 : 0), + plan(cfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+2*critbuf+plan->bufsize(); } + template DUCC0_NOINLINE Cmplx *exec(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf+critbuf+plan->bufsize(), buf+critbuf, fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Cmplx *in, Cmplx *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tic = tidx *>(); + auto res = static_cast *>(plan->exec(tic, + in, buf, buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Cmplx *in, Tfs fct, bool fwd, size_t nthreads=1) const + { + aligned_array> buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class pocketfft_r + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_r(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N*plan->needs_copy()+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, in, buf, + buf+N*plan->needs_copy(), fwd, nthreads)); + if (res==in) + { + if (fct!=Tfs(1)) + for (size_t i=0; i DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + aligned_array buf(N*plan->needs_copy()+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +template class pocketfft_hartley + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_hartley(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, + in, buf, buf+N, true, nthreads)); + auto res2 = (res==buf) ? 
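+    /* Usage sketch for the plan classes above (assumes the caller's data is
+       layout-compatible with ducc0::Cmplx<double>):
+
+         ducc0::pocketfft_c<double> plan(1024);     // build once, reuse
+         std::vector<ducc0::Cmplx<double>> v(1024);
+         plan.exec(v.data(), 1.0, true);            // forward, unnormalized
+         plan.exec(v.data(), 1.0/1024, false);      // inverse, 1/N scaling
+    */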
in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, nthreads); + } + }; + +template class pocketfft_fht + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_fht(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = static_cast(plan->exec(tifd, + in, buf, buf+N, true, nthreads)); + auto res2 = (res==buf) ? in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, nthreads); + } + }; + +// R2R transforms using FFTW's halfcomplex format +template class pocketfft_fftw + { + private: + size_t N; + Trpass plan; + + public: + pocketfft_fftw(size_t n, bool vectorize=false) + : N(n), plan(rfftpass::make_pass(n,vectorize)) {} + size_t length() const { return N; } + size_t bufsize() const { return N+plan->bufsize(); } + template DUCC0_NOINLINE Tfd *exec(Tfd *in, Tfd *buf, Tfs fct, + bool fwd, size_t nthreads=1) const + { + static const auto tifd = tidx(); + auto res = in; + auto res2 = buf; + if (!fwd) // go to FFTPACK halfcomplex order + { + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i(plan->exec(tifd, + res, res2, buf+N, fwd, nthreads)); + if (!fwd) return res; + + // go to FFTW halfcomplex order + res2 = (res==buf) ? 
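+    /* NOTE: FFTW's halfcomplex layout is r0,r1,...,r(n/2),i((n+1)/2-1),...,i1,
+       while the FFTPACK-style kernels underneath produce r0,r1,i1,r2,i2,...;
+       the index shuffles on either side of plan->exec translate between the
+       two orderings. */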
in : buf; + res2[0] = fct*res[0]; + size_t i=1, i1=1, i2=N-1; + for (i=1; i DUCC0_NOINLINE void exec_copyback(Tfd *in, Tfd *buf, + Tfs fct, bool fwd, size_t nthreads=1) const + { + auto res = exec(in, buf, fct, fwd, nthreads); + if (res!=in) + copy_n(res, N, in); + } + template DUCC0_NOINLINE void exec(Tfd *in, Tfs fct, bool fwd, + size_t nthreads=1) const + { + aligned_array buf(N+plan->bufsize()); + exec_copyback(in, buf.data(), fct, fwd, nthreads); + } + }; + +// +// sine/cosine transforms +// + +template class T_dct1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dct1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length-1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=fftplan.length(), n=N/2+1; + if (ortho) + { c[0]*=sqrt2; c[n-1]*=sqrt2; } + auto tmp=&buf[0]; + tmp[0] = c[0]; + for (size_t i=1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, 1, true, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, ortho, 1, true, nthreads); + } + + size_t length() const { return fftplan.length()/2+1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dst1 + { + private: + pocketfft_r fftplan; + + public: + DUCC0_NOINLINE T_dst1(size_t length, bool /*vectorize*/=false) + : fftplan(2*(length+1)) {} + + template DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + size_t N=fftplan.length(), n=N/2-1; + auto tmp = &buf[0]; + tmp[0] = tmp[n+1] = c[0]*0; + for (size_t i=0; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads=1) const + { + exec(c, buf, fct, true, 1, false, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool /*cosine*/, size_t nthreads) const + { + aligned_array buf(bufsize()); + exec_copyback(c, buf.data(), fct, true, 1, false, nthreads); + } + + size_t length() const { return fftplan.length()/2-1; } + size_t bufsize() const { return fftplan.length()+fftplan.bufsize(); } + }; + +template class T_dcst23 + { + private: + pocketfft_r fftplan; + std::vector twiddle; + + public: + DUCC0_NOINLINE T_dcst23(size_t length, bool /*vectorize*/=false) + : fftplan(length), twiddle(length) + { + UnityRoots> tw(4*length); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + size_t N=length(); + size_t NS2 = (N+1)/2; + if (type==2) + { + c[0] *= 2; + if ((N&1)==0) c[N-1]*=2; + if (cosine) + for (size_t k=1; k DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool ortho, int type, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, ortho, type, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, bool ortho, + int type, bool cosine, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec(c, &buf[0], fct, ortho, type, cosine, nthreads); + } + + size_t length() const { return fftplan.length(); } + size_t 
bufsize() const { return fftplan.bufsize(); } + }; + +template class T_dcst4 + { + private: + size_t N; + std::unique_ptr> fft; + std::unique_ptr> rfft; + aligned_array> C2; + size_t bufsz; + + public: + DUCC0_NOINLINE T_dcst4(size_t length, bool /*vectorize*/=false) + : N(length), + fft((N&1) ? nullptr : make_unique>(N/2)), + rfft((N&1)? make_unique>(N) : nullptr), + C2((N&1) ? 0 : N/2), + bufsz((N&1) ? (N+rfft->bufsize()) : (N+2*fft->bufsize())) + { + if ((N&1)==0) + { + UnityRoots> tw(16*N); + for (size_t i=0; i DUCC0_NOINLINE T *exec(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads) const + { + size_t n2 = N/2; + if (!cosine) + for (size_t k=0, kc=N-1; kexec(y, y+N, fct, true, nthreads); + { + auto SGN = [](size_t i) + { + constexpr T0 sqrt2=T0(1.414213562373095048801688724209698L); + return (i&2) ? -sqrt2 : sqrt2; + }; + c[n2] = res[0]*SGN(n2+1); + size_t i=0, i1=1, k=1; + for (; k *>(buf); + for(size_t i=0; iexec(y2, y2+N/2, fct, true, nthreads); + for(size_t i=0, ic=n2-1; i DUCC0_NOINLINE void exec_copyback(T c[], T buf[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + exec(c, buf, fct, true, 4, cosine, nthreads); + } + template DUCC0_NOINLINE void exec(T c[], T0 fct, + bool /*ortho*/, int /*type*/, bool cosine, size_t nthreads=1) const + { + aligned_array buf(bufsize()); + exec(c, &buf[0], fct, true, 4, cosine, nthreads); + } + + size_t length() const { return N; } + size_t bufsize() const { return bufsz; } + }; + +using shape_t=fmav_info::shape_t; +using stride_t=fmav_info::stride_t; + +constexpr bool FORWARD = true, + BACKWARD = false; + +/// Complex-to-complex Fast Fourier Transform +/** This executes a Fast Fourier Transform on \a in and stores the result in + * \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a minus sign will be used in the exponent. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void c2c(const cfmav> &in, + const vfmav> &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1); + +/// Fast Discrete Cosine Transform +/** This executes a DCT on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DCT is computed, otherwise an inverse DCT. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. 
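+ *
+ * Usage sketch (hypothetical driver; assumes the contiguous (pointer, shape)
+ * view constructor):
+ *
+ *   std::vector<double> v(128, 1.0);
+ *   ducc0::vfmav<double> a(v.data(), {128});
+ *   ducc0::dct(a, a, {0}, 2, 1.0, false, 1);  // in-place DCT-II along axis 0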
+ */ +template DUCC0_NOINLINE void dct(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1); + +/// Fast Discrete Sine Transform +/** This executes a DST on \a in and stores the result in \a out. + * + * \a in and \a out must have identical shapes; they may point to the same + * memory; in this case their strides must also be identical. + * + * \a axes specifies the axes over which the transform is carried out. + * + * If \a forward is true, a DST is computed, otherwise an inverse DST. + * + * \a type specifies the desired type (1-4) of the transform. + * + * No normalization factors will be applied by default; if multiplication by + * a constant is desired, it can be supplied in \a fct. + * + * If \a ortho is true, the first and last array entries are corrected (if + * necessary) to allow an orthonormalized transform. + * + * If the underlying array has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void dst(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads=1); + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, const shape_t &axes, + bool forward, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, size_t axis, bool forward, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void c2r_mut(const vfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_fftpack(const cfmav &in, + const vfmav &out, const shape_t &axes, bool real2hermitian, bool forward, + T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_fftw(const cfmav &in, + const vfmav &out, const shape_t &axes, bool forward, + T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_separable_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template DUCC0_NOINLINE void r2r_separable_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template void r2r_genuine_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +template void r2r_genuine_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads=1); + +/// Convolution and zero-padding/truncation along one axis +/** This performs a circular convolution with the kernel \a kernel on axis + * \a axis of \a in, applies the necessary zero-padding/truncation on this + * axis to give it the length \a out.shape(axis),and returns the result + * in \a out. + * + * The main purpose of this routine is efficiency: the combination of the above + * operations can be carried out more quickly than running the individual + * operations in succession. + * + * \a in and \a out must have identical shapes, with the possible exception + * of the axis \a axis; they may point to the same memory; in this case all + * of their strides must be identical. + * + * \a axis specifies the axis over which the operation is carried out. + * + * \a kernel must have the same length as \a in.shape(axis); it must be + * provided in the same domain as \a in (i.e. 
not pre-transformed). + * + * If \a in has more than one dimension, the computation will + * be distributed over \a nthreads threads. + */ +template DUCC0_NOINLINE void convolve_axis(const cfmav &in, + const vfmav &out, size_t axis, const cmav &kernel, size_t nthreads=1); + +template DUCC0_NOINLINE void convolve_axis(const cfmav> &in, + const vfmav> &out, size_t axis, const cmav,1> &kernel, + size_t nthreads=1); +} + +using detail_fft::pocketfft_c; +using detail_fft::pocketfft_r; +using detail_fft::pocketfft_hartley; +using detail_fft::pocketfft_fht; +using detail_fft::pocketfft_fftw; + +using detail_fft::FORWARD; +using detail_fft::BACKWARD; +using detail_fft::c2c; +using detail_fft::c2r; +using detail_fft::c2r_mut; +using detail_fft::r2c; +using detail_fft::r2r_fftpack; +using detail_fft::r2r_fftw; +using detail_fft::r2r_separable_hartley; +using detail_fft::r2r_genuine_hartley; +using detail_fft::r2r_separable_fht; +using detail_fft::r2r_genuine_fht; +using detail_fft::dct; +using detail_fft::dst; +using detail_fft::convolve_axis; + +inline size_t good_size_complex(size_t n) + { return detail_fft::util1d::good_size_cmplx(n); } +inline size_t good_size_real(size_t n) + { return detail_fft::util1d::good_size_real(n); } + +} + +#endif diff --git a/contrib/ducc0/fft/fft1d_impl.h b/contrib/ducc0/fft/fft1d_impl.h new file mode 100644 index 000000000..f2bf69361 --- /dev/null +++ b/contrib/ducc0/fft/fft1d_impl.h @@ -0,0 +1,2990 @@ +/* +This file is part of the ducc FFT library + +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +For the odd-sized DCT-IV transforms: + Copyright (C) 2003, 2007-14 Matteo Frigo + Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
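+
+[Usage sketch for the public entry points declared in fft.h, assuming the
+ contiguous (pointer, shape) view constructors:
+
+   std::vector<std::complex<double>> buf(512, {1.0, 0.0});
+   ducc0::vfmav<std::complex<double>> arr(buf.data(), {512});
+   ducc0::c2c(arr, arr, {0}, ducc0::FORWARD, 1.0, 1);  // in-place, axis 0
+
+   std::vector<double> t(1000, 1.0);
+   std::vector<std::complex<double>> spec(501);        // n/2+1 coefficients
+   ducc0::cfmav<double> in(t.data(), {1000});
+   ducc0::vfmav<std::complex<double>> out(spec.data(), {501});
+   ducc0::r2c(in, out, 0, ducc0::FORWARD, 1.0, 1);     // half-spectrum r2c]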
+*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFT1D_IMPL_H +#define DUCC0_FFT1D_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/threading.h" +#include "ducc0/math/unity_roots.h" +#include "ducc0/fft/fft.h" + +namespace ducc0 { + +namespace detail_fft { + +using namespace std; + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +template constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft1d_simdlen + = min(8, native_simd::size()); +template using fft1d_simd = typename simd_select>::type; +template constexpr inline bool fft1d_simd_exists = (fft1d_simdlen > 1); + +// Always use std:: for functions +template T cos(T) = delete; +template T sin(T) = delete; +template T sqrt(T) = delete; + +template void ROTX90(Cmplx &a) + { auto tmp_= fwd ? -a.r : a.r; a.r = fwd ? a.i : -a.i; a.i=tmp_; } + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads=1) const \ + { \ + static const auto tics = tidx(); \ + if (ti==tics) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
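+      /* NOTE: this dispatch tries the scalar complex type first, then each */ \
+      /* available SIMD width (native, /2, /4, /8) against the runtime      */ \
+      /* type_index, so one pass object serves scalar and vector callers.   */ \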
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + using Tcv = Cmplx; \ + static const auto ticv = tidx(); \ + if (ti==ticv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +template class cfftp1: public cfftpass + { + public: + cfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class cfftp2: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=2; + aligned_array wa; + + auto WA(size_t i) const + { return wa[i-1]; } + + template Tcd *exec_ (const Tcd * DUCC0_RESTRICT cc, + Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(CC(i,0,k)-CC(i,1,k),WA(i),CH(i,k,1)); + } + } + return ch; + } + } + + public: + cfftp2(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp3: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=3; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r=-0.5, + tw1i= (fwd ? 
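+      /* NOTE: -0.5 and 0.8660254... are cos(2*pi/3) and sin(2*pi/3); the
+         compile-time sign on the imaginary part selects the transform
+         direction. The radix-5/7/11 passes below follow the same pattern. */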
-1: 1) * Tfs(0.8660254037844386467637231707529362L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP3(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2; \ + PM (t1,t2,CC(idx,1,k),CC(idx,2,k)); \ + CH(idx,k,0)=t0+t1; +#define POCKETFFT_PARTSTEP3a(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb) ;\ + } +#define POCKETFFT_PARTSTEP3b(u1,u2,twr,twi) \ + { \ + Tcd ca=t0+t1*twr; \ + Tcd cb{-t2.i*twi, t2.r*twi}; \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp4: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=4; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k(t4); + PM(CH(k,0),CH(k,2),t2,t3); + PM(CH(k,1),CH(k,3),t1,t4); + } + } + else + { + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(t4); + PM(CH(0,k,0),CH(0,k,2),t2,t3); + PM(CH(0,k,1),CH(0,k,3),t1,t4); + } + for (size_t i=1; i(t4); + CH(i,k,0) = t2+t3; + special_mul(t1+t4,WA(0,i),CH(i,k,1)); + special_mul(t2-t3,WA(1,i),CH(i,k,2)); + special_mul(t1-t4,WA(2,i),CH(i,k,3)); + } + } + } + return ch; + } + + public: + cfftp4(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp5: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=5; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.3090169943749474241022934171828191L), + tw1i= (fwd ? -1: 1) * Tfs(0.9510565162951535721164393333793821L), + tw2r= Tfs(-0.8090169943749474241022934171828191L), + tw2i= (fwd ? 
-1: 1) * Tfs(0.5877852522924731291687059546390728L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP5(idx) \ + Tcd t0 = CC(idx,0,k), t1, t2, t3, t4; \ + PM (t1,t4,CC(idx,1,k),CC(idx,4,k)); \ + PM (t2,t3,CC(idx,2,k),CC(idx,3,k)); \ + CH(idx,k,0).r=t0.r+t1.r+t2.r; \ + CH(idx,k,0).i=t0.i+t1.i+t2.i; + +#define POCKETFFT_PARTSTEP5a(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + PM(CH(0,k,u1),CH(0,k,u2),ca,cb); \ + } + +#define POCKETFFT_PARTSTEP5b(u1,u2,twar,twbr,twai,twbi) \ + { \ + Tcd ca,cb,da,db; \ + ca.r=t0.r+twar*t1.r+twbr*t2.r; \ + ca.i=t0.i+twar*t1.i+twbr*t2.i; \ + cb.i=twai*t4.r twbi*t3.r; \ + cb.r=-(twai*t4.i twbi*t3.i); \ + special_mul(ca+cb,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(ca-cb,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp7: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=7; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.6234898018587335305250048840042398L), + tw1i= (fwd ? -1 : 1) * Tfs(0.7818314824680298087084445266740578L), + tw2r= Tfs(-0.2225209339563144042889025644967948L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9749279121818236070181316829939312L), + tw3r= Tfs(-0.9009688679024191262361023195074451L), + tw3i= (fwd ? 
-1 : 1) * Tfs(0.433883739117558120475768332848359L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP7(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7; \ + PM (t2,t7,CC(idx,1,k),CC(idx,6,k)); \ + PM (t3,t6,CC(idx,2,k),CC(idx,5,k)); \ + PM (t4,t5,CC(idx,3,k),CC(idx,4,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i; + +#define POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,out1,out2) \ + { \ + Tcd ca,cb; \ + ca.r=t1.r+x1*t2.r+x2*t3.r+x3*t4.r; \ + ca.i=t1.i+x1*t2.i+x2*t3.i+x3*t4.i; \ + cb.i=y1*t7.r y2*t6.r y3*t5.r; \ + cb.r=-(y1*t7.i y2*t6.i y3*t5.i); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP7a(u1,u2,x1,x2,x3,y1,y2,y3) \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP7(u1,u2,x1,x2,x3,y1,y2,y3) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP7a0(u1,u2,x1,x2,x3,y1,y2,y3,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftp8: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=8; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template void ROTX45(T &a) const + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + if constexpr (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.r+a.i); a.i=hsqt2*(a.i-tmp_); } + else + { auto tmp_=a.r; a.r=hsqt2*(a.r-a.i); a.i=hsqt2*(a.i+tmp_); } + } + template void ROTX135(T &a) const + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + if constexpr (fwd) + { auto tmp_=a.r; a.r=hsqt2*(a.i-a.r); a.i=hsqt2*(-tmp_-a.i); } + else + { auto tmp_=a.r; a.r=hsqt2*(-a.r-a.i); a.i=hsqt2*(tmp_-a.i); } + } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + if (l1==1) + { + auto CC = [cc,this](size_t a, size_t b) -> Tcd& + { return cc[a+ido*b]; }; + { + Tcd a0, a1, a2, a3, a4, a5, a6, a7; + PM(a1,a5,CC(0,1),CC(0,5)); + PM(a3,a7,CC(0,3),CC(0,7)); + PMINPLACE(a1,a3); + ROTX90(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0),CC(0,4)); + PM(a2,a6,CC(0,2),CC(0,6)); + PM(CC(0,0),CC(0,4),a0+a2,a1); + PM(CC(0,2),CC(0,6),a0-a2,a3); + ROTX90(a6); + PM(CC(0,1),CC(0,5),a4+a6,a5); + PM(CC(0,3),CC(0,7),a4-a6,a7); + } + for (size_t i=1; i(a7); + PMINPLACE(a1,a3); + ROTX90(a3); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + PM(a0,a4,CC(i,0),CC(i,4)); + PM(a2,a6,CC(i,2),CC(i,6)); + PMINPLACE(a0,a2); + CC(i,0) = a0+a1; + special_mul(a0-a1,WA(3,i),CC(i,4)); + special_mul(a2+a3,WA(1,i),CC(i,2)); + special_mul(a2-a3,WA(5,i),CC(i,6)); + ROTX90(a6); + PMINPLACE(a4,a6); + special_mul(a4+a5,WA(0,i),CC(i,1)); + special_mul(a4-a5,WA(4,i),CC(i,5)); + special_mul(a6+a7,WA(2,i),CC(i,3)); + special_mul(a6-a7,WA(6,i),CC(i,7)); + } + return cc; + } + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tcd& + { return ch[b+l1*c]; }; + auto CC = [cc](size_t b, size_t c) -> const Tcd& + { return cc[b+ip*c]; }; + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + 
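+        // NOTE: ROTX90/ROTX45/ROTX135 apply the fixed radix-8 twiddles
+        // exp(-/+ i*pi/2), exp(-/+ i*pi/4) and exp(-/+ 3*i*pi/4) using only
+        // additions and a single 1/sqrt(2) scaling instead of a general
+        // complex multiply.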
PM(a0,a4,CC(0,k),CC(4,k)); + PM(a2,a6,CC(2,k),CC(6,k)); + PM(CH(k,0),CH(k,4),a0+a2,a1); + PM(CH(k,2),CH(k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(k,1),CH(k,5),a4+a6,a5); + PM(CH(k,3),CH(k,7),a4-a6,a7); + } + } + else + { + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + for (size_t k=0; k(a3); + + ROTX90(a7); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + + PM(a0,a4,CC(0,0,k),CC(0,4,k)); + PM(a2,a6,CC(0,2,k),CC(0,6,k)); + PM(CH(0,k,0),CH(0,k,4),a0+a2,a1); + PM(CH(0,k,2),CH(0,k,6),a0-a2,a3); + ROTX90(a6); + PM(CH(0,k,1),CH(0,k,5),a4+a6,a5); + PM(CH(0,k,3),CH(0,k,7),a4-a6,a7); + } + for (size_t i=1; i(a7); + PMINPLACE(a1,a3); + ROTX90(a3); + PMINPLACE(a5,a7); + ROTX45(a5); + ROTX135(a7); + PM(a0,a4,CC(i,0,k),CC(i,4,k)); + PM(a2,a6,CC(i,2,k),CC(i,6,k)); + PMINPLACE(a0,a2); + CH(i,k,0) = a0+a1; + special_mul(a0-a1,WA(3,i),CH(i,k,4)); + special_mul(a2+a3,WA(1,i),CH(i,k,2)); + special_mul(a2-a3,WA(5,i),CH(i,k,6)); + ROTX90(a6); + PMINPLACE(a4,a6); + special_mul(a4+a5,WA(0,i),CH(i,k,1)); + special_mul(a4-a5,WA(4,i),CH(i,k,5)); + special_mul(a6+a7,WA(2,i),CH(i,k,3)); + special_mul(a6-a7,WA(6,i),CH(i,k,7)); + } + } + } + return ch; + } + + public: + cfftp8(size_t l1_, size_t ido_, const Troots &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i1; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class cfftp11: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + static constexpr size_t ip=11; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[x+(i-1)*(ip-1)]; } + + template [[gnu::hot]] Tcd *exec_ + (const Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, + size_t /*nthreads*/) const + { + constexpr Tfs tw1r= Tfs(0.8412535328311811688618116489193677L), + tw1i= (fwd ? -1 : 1) * Tfs(0.5406408174555975821076359543186917L), + tw2r= Tfs(0.4154150130018864255292741492296232L), + tw2i= (fwd ? -1 : 1) * Tfs(0.9096319953545183714117153830790285L), + tw3r= Tfs(-0.1423148382732851404437926686163697L), + tw3i= (fwd ? -1 : 1) * Tfs(0.9898214418809327323760920377767188L), + tw4r= Tfs(-0.6548607339452850640569250724662936L), + tw4i= (fwd ? -1 : 1) * Tfs(0.7557495743542582837740358439723444L), + tw5r= Tfs(-0.9594929736144973898903680570663277L), + tw5i= (fwd ? 
-1 : 1) * Tfs(0.2817325568414296977114179153466169L); + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +#define POCKETFFT_PREP11(idx) \ + Tcd t1 = CC(idx,0,k), t2, t3, t4, t5, t6, t7, t8, t9, t10, t11; \ + PM (t2,t11,CC(idx,1,k),CC(idx,10,k)); \ + PM (t3,t10,CC(idx,2,k),CC(idx, 9,k)); \ + PM (t4,t9 ,CC(idx,3,k),CC(idx, 8,k)); \ + PM (t5,t8 ,CC(idx,4,k),CC(idx, 7,k)); \ + PM (t6,t7 ,CC(idx,5,k),CC(idx, 6,k)); \ + CH(idx,k,0).r=t1.r+t2.r+t3.r+t4.r+t5.r+t6.r; \ + CH(idx,k,0).i=t1.i+t2.i+t3.i+t4.i+t5.i+t6.i; + +#define POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,out1,out2) \ + { \ + Tcd ca = t1 + t2*x1 + t3*x2 + t4*x3 + t5*x4 +t6*x5, \ + cb; \ + cb.i=y1*t11.r y2*t10.r y3*t9.r y4*t8.r y5*t7.r; \ + cb.r=-(y1*t11.i y2*t10.i y3*t9.i y4*t8.i y5*t7.i ); \ + PM(out1,out2,ca,cb); \ + } +#define POCKETFFT_PARTSTEP11a(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,CH(0,k,u1),CH(0,k,u2)) +#define POCKETFFT_PARTSTEP11(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5) \ + { \ + Tcd da,db; \ + POCKETFFT_PARTSTEP11a0(u1,u2,x1,x2,x3,x4,x5,y1,y2,y3,y4,y5,da,db) \ + special_mul(da,WA(u1-1,i),CH(i,k,u1)); \ + special_mul(db,WA(u2-1,i),CH(i,k,u2)); \ + } + + if (ido==1) + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t i=1; i class cfftpg: public cfftpass + { + private: + using typename cfftpass::Tcs; + + size_t l1, ido; + size_t ip; + aligned_array wa; + aligned_array csarr; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, Tcd * /*buf*/, size_t /*nthreads*/) const + { + size_t ipph = (ip+1)/2; + size_t idl1 = ido*l1; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tcd& + { return cc[a+ido*(b+ip*c)]; }; + auto CX = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+l1*c)]; }; + auto CX2 = [cc, idl1](size_t a, size_t b) -> Tcd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch, idl1](size_t a, size_t b) -> const Tcd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iwal-=ip; + Tcs xwal=fwd ? csarr[iwal].conj() : csarr[iwal]; + iwal+=l; if (iwal>ip) iwal-=ip; + Tcs xwal2=fwd ? csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ikip) iwal-=ip; + Tcs xwal=fwd ? 
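+        /* NOTE: cfftpg is the generic pass for odd factors >= 5; it evaluates
+           the DFT directly from the precomputed roots in csarr at O(ip^2)
+           cost per transform, so large prime factors are better served by
+           the Bluestein pass (cfftpblue) below. */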
csarr[iwal].conj() : csarr[iwal]; + for (size_t ik=0; ik(x1,wa[idij],CX(i,k,j)); + idij=(jc-1)*(ido-1)+i-1; + special_mul(x2,wa[idij],CX(i,k,jc)); + } + } + } + return cc; + } + + public: + cfftpg(size_t l1_, size_t ido_, size_t ip_, const Troots &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(ip) + { + MR_assert((ip&1)&&(ip>=5), "need an odd number >=5"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class cfftpblue: public cfftpass + { + private: + using typename cfftpass::Tcs; + + const size_t l1, ido, ip; + const size_t ip2; + const Tcpass subplan; + aligned_array wa, bk, bkf; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i-1+x*(ido-1)]; } + + template Tcd *exec_ + (Tcd * DUCC0_RESTRICT cc, Tcd * DUCC0_RESTRICT ch, + Tcd * DUCC0_RESTRICT buf, size_t nthreads) const + { + static const auto ti=tidx(); + Tcd *akf = &buf[0]; + Tcd *akf2 = subplan->needs_copy() ? (&buf[ip2]) : akf; + Tcd *subbuf = akf2+ip2; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tcd& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tcd& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME: parallelize here? + for (size_t k=0; k(CC(i,m,k),bk[m],akf[m]); + auto zero = akf[0]*Tfs(0); + for (size_t m=ip; m(subplan->exec(ti,akf,akf2, + subbuf, true, nthreads)); + + /* do the convolution */ + res[0] = res[0].template special_mul(bkf[0]); + for (size_t m=1; m<(ip2+1)/2; ++m) + { + res[m] = res[m].template special_mul(bkf[m]); + res[ip2-m] = res[ip2-m].template special_mul(bkf[m]); + } + if ((ip2&1)==0) + res[ip2/2] = res[ip2/2].template special_mul(bkf[ip2/2]); + + /* inverse FFT */ + res = static_cast(subplan->exec(ti, res, + (res==akf) ? akf2 : akf, subbuf, false, nthreads)); + + /* multiply by b_k and write to output buffer */ + if (l1>1) + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CH(i,k,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + else + { + if (i==0) + for (size_t m=0; m(bk[m]); + else + { + CC(i,0,0) = res[0].template special_mul(bk[0]); + for (size_t m=1; m(bk[m]*WA(m-1,i)); + } + } + } + + return (l1>1) ? ch : cc; + } + + public: + cfftpblue(size_t l1_, size_t ido_, size_t ip_, const Troots &roots, + bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), ip2(util1d::good_size_cmplx(ip*2-1)), + subplan(cfftpass::make_pass(ip2, vectorize)), wa((ip-1)*(ido-1)), + bk(ip), bkf(ip2/2+1) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jsize()/(2*ip))*2*ip==roots->size()) ? + roots : make_shared>(2*ip); + size_t rfct2 = roots2->size()/(2*ip); + for (size_t m=1; m=2*ip) coeff-=2*ip; + bk[m] = (*roots2)[coeff*rfct2]; + } + + /* initialize the zero-padded, Fourier transformed b_k. Add normalisation. 
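+     Sketch of the math: the weights built above are the Bluestein chirp
+         b_k = exp(+/- i*pi*k^2/ip),  k = 0..ip-1,
+     with k^2 reduced mod 2*ip (hence the incremental "coeff" update); bkf
+     holds the length-ip2 FFT of the zero-padded chirp, pre-scaled by 1/ip2
+     so the inverse FFT after the pointwise product needs no further
+     normalisation.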
*/ + aligned_array tbkf(ip2), tbkf2(ip2); + Tfs xn2 = Tfs(1)/Tfs(ip2); + tbkf[0] = bk[0]*xn2; + for (size_t m=1; m buf(subplan->bufsize()); + static const auto tics=tidx(); + auto res = static_cast(subplan->exec(tics, tbkf.data(), + tbkf2.data(), buf.data(), true)); + for (size_t i=0; i1; + bufsz = ip2*(1+subplan->needs_copy()) + subplan->bufsize(); + } + + virtual size_t bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class cfft_multipass: public cfftpass + { + private: + using typename cfftpass::Tcs; + static constexpr size_t bunchsize=8; + + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + size_t rfct; + Troots myroots; + +// FIXME split into sub-functions. This is too long! + template Cmplx *exec_(Cmplx *cc, Cmplx *ch, + Cmplx *buf, size_t nthreads) const + { + using Tc = Cmplx; + if ((l1==1) && (ido==1)) // no chance at vectorizing + { + static const auto tic=tidx(); + Tc *p1=cc, *p2=ch; + for(const auto &pass: passes) + { + auto res = static_cast(pass->exec(tic, p1, p2, buf, + fwd, nthreads)); + if (res==p2) swap (p1,p2); + } + return p1; + } + else + { + if constexpr(is_same::value && fft1d_simd_exists) // we can vectorize! + { + using Tfv = fft1d_simd; + using Tcv = Cmplx; + constexpr size_t vlen = Tfv::size(); + size_t nvtrans = (l1*ido + vlen-1)/vlen; + // NOTE: removed "static" here, because it leads to trouble with gcc 7 + // static const type_index ticv = tidx(); + const type_index ticv = tidx(); + + if (ido==1) + { + auto CH = [ch,this](size_t b, size_t c) -> Tc& + { return ch[b+l1*c]; }; + auto CC = [cc,this](size_t b, size_t c) -> Tc& + { return cc[b+ip*c]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + aligned_array tbuf(2*ip+32+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip+16]; + auto buf2 = &tbuf[2*ip+32]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m Tc& + { return cc[a+ido*b]; }; + + execStatic(nvtrans, nthreads, 0, [&](auto &sched) + { + aligned_array tbuf(2*ip+32+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip+16]; + auto buf2 = &tbuf[2*ip+32]; + + while (auto rng=sched.getNext()) + for(auto itrans=rng.lo; itrans(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= ido) break; + if (i==0) + CC(0,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CC(i,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CC(i,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*m*i]); + } + } + } + }); + return cc; + } + +MR_fail("must not get here"); +#if 0 +//FIXME this code path is currently unused + aligned_array tbuf(2*ip+bufsize()); + auto cc2 = &tbuf[0]; + auto ch2 = &tbuf[ip]; + auto buf2 = &tbuf[2*ip]; + + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +//FIXME parallelize? 
+ for (size_t itrans=0; itrans ix, kx; + size_t ixcur = (itrans*vlen)%ido; + size_t kxcur = (itrans*vlen)/ido; + for (size_t n=0; n(pass->exec(ticv, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + + for (size_t m=0; m= l1*ido) break; + if (i==0) + CH(0,k,m) = { p1[m].r[n], p1[m].i[n] }; + else + { + if (m==0) + CH(i,k,0) = { p1[0].r[n], p1[0].i[n] } ; + else + CH(i,k,m) = Tcs(p1[m].r[n],p1[m].i[n]).template special_mul((*myroots)[rfct*l1*m*i]); + } + } + } + return ch; +#endif + } + else + { + static const auto tic = tidx *>(); + if (ido==1) + { +// parallelize here! + for (size_t n=0; n *p1=&cc[n*ip], *p2=ch; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf, fwd)); + if (res==p2) swap (p1,p2); + } + if (res != &cc[n*ip]) + copy(res, res+ip, cc+n*ip); + } + // transpose + size_t nbunch = (l1*ido + bunchsize-1)/bunchsize; +// parallelize here! + for (size_t ibunch=0; ibunch Tc& + { return cc[a+ido*b]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*m*i]); + } + } + } + for (size_t m=0; m Tc& + { return ch[a+ido*(b+l1*c)]; }; + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tc& + { return cc[a+ido*(b+ip*c)]; }; + +// parallelize here! + for (size_t ibunch=0; ibunch ix, kx; + size_t ixcur = (ibunch*bunchsize)%ido; + size_t kxcur = (ibunch*bunchsize)/ido; + for (size_t n=0; n *p1=&cc2[n*ip], *p2=ch2; + Cmplx *res = nullptr; + for(const auto &pass: passes) + { + res = static_cast *>(pass->exec(tic, + p1, p2, buf2, fwd)); + if (res==p2) swap (p1,p2); + } + if (res==&cc2[n*ip]) // no copying necessary + { + if (i!=0) + { + for (size_t m=1; m((*myroots)[rfct*l1*m*i]); + } + } + else + { + if (i==0) + for (size_t m=0; m((*myroots)[rfct*l1*m*i]); + } + } + } + for (size_t m=0; m &roots, bool /*vectorize*/=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + myroots(roots) + { + size_t N=ip*l1*ido; + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + + // FIXME TBD +// do we need the vectorize flag at all? + size_t lim = 10000; //vectorize ? 10000 : 10000; + if (ip<=lim) + { + auto factors = cfftpass::factorize(ip); + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(cfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots, false)); + l1l*=fct; + } + } + else + { + vector packets(2,1); + auto factors = util1d::prime_factors(ip); + sort(factors.begin(), factors.end(), std::greater()); + for (auto fct: factors) + (packets[0]>packets[1]) ? 
packets[1]*=fct : packets[0]*=fct;
+      size_t l1l=1;
+      for (auto pkt: packets)
+        {
+        passes.push_back(cfftpass<Tfs>::make_pass(l1l, ip/(pkt*l1l), pkt, roots, false));
+        l1l*=pkt;
+        }
+      }
+    for (const auto &pass: passes)
+      {
+      bufsz = max(bufsz, pass->bufsize());
+      need_cpy |= pass->needs_copy();
+      }
+    if ((l1!=1)||(ido!=1))
+      {
+      need_cpy=true;
+      bufsz += (bunchsize+1)*ip;
+      }
+    }
+
+  virtual size_t bufsize() const { return bufsz; }
+  virtual bool needs_copy() const { return need_cpy; }
+
+  POCKETFFT_EXEC_DISPATCH
+  };
+
+#undef POCKETFFT_EXEC_DISPATCH
+
+template <typename Tfs, size_t vlen> class cfftp_vecpass: public cfftpass<Tfs>
+  {
+  private:
+    static_assert(simd_exists<Tfs,vlen>, "bad vlen");
+    using typename cfftpass<Tfs>::Tcs;
+    using Tfv=typename simd_select<Tfs,vlen>::type;
+    using Tcv=Cmplx<Tfv>;
+
+    size_t ip;
+    Tcpass<Tfs> spass;
+    Tcpass<Tfs> vpass;
+    size_t bufsz;
+
+    template<bool fwd> Tcs *exec_ (Tcs *cc,
+      Tcs * /*ch*/, Tcs *sbuf, size_t nthreads) const
+      {
+      char *xbuf = reinterpret_cast<char *>(sbuf);
+      size_t misalign = reinterpret_cast<uintptr_t>(xbuf)&(sizeof(Tfv)-1);
+      if (misalign != 0)
+        xbuf += sizeof(Tfv)-misalign;
+      Tcv *buf = reinterpret_cast<Tcv *>(xbuf);
+      auto * cc2 = buf;
+      auto * ch2 = buf+ip/vlen+7;
+      auto * buf2 = buf+2*ip/vlen+7+7;
+      static const auto tics = tidx<Tcs *>();
+// run scalar pass
+      auto res = static_cast<Tcs *>(spass->exec(tics, cc,
+        reinterpret_cast<Tcs *>(ch2), reinterpret_cast<Tcs *>(buf2),
+        fwd, nthreads));
+// arrange input in SIMD-friendly way, must be done out-of-place
+      for (size_t i=0; i<ip/vlen; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          {
+          cc2[i].r[j] = res[i+j*(ip/vlen)].r;
+          cc2[i].i[j] = res[i+j*(ip/vlen)].i;
+          }
+      static const auto ticv = tidx<Tcv *>();
+      auto res2 = static_cast<Tcv *>(vpass->exec(ticv,
+        cc2, ch2, buf2, fwd, nthreads));
+// de-SIMDify, can be done pseudo-inplace
+      for (size_t i=0; i<ip/vlen; ++i)
+        for (size_t j=0; j<vlen; ++j)
+          cc[i*vlen+j] = Tcs(res2[i].r[j], res2[i].i[j]);
+      return cc;
+      }
+
+  public:
+    cfftp_vecpass(size_t ip_, const Troots<Tfs> &roots)
+      : ip(ip_), spass(cfftpass<Tfs>::make_pass(1, ip/vlen, vlen, roots)),
+        vpass(cfftpass<Tfs>::make_pass(1, 1, ip/vlen, roots)), bufsz(0)
+      {
+      MR_assert((ip/vlen)*vlen==ip, "cannot vectorize this size");
+      bufsz = 2*(ip/vlen)+7+7;
+      bufsz += max(vpass->bufsize(),(spass->bufsize()+vlen-1)/vlen);  // buffers for subpasses
+      bufsz *= vlen;  // since we specify in terms of Tcs
+      bufsz += vlen;  // wiggle room for alignment shifts
+      }
+    virtual size_t bufsize() const { return bufsz; }
+    virtual bool needs_copy() const { return false; }
+    virtual void *exec(const type_index &ti, void *in, void *copy, void *buf,
+      bool fwd, size_t nthreads=1) const
+      {
+      static const auto tics = tidx<Tcs *>();
+      MR_assert(ti==tics, "bad input type");
+      auto in1 = static_cast<Tcs *>(in);
+      auto copy1 = static_cast<Tcs *>(copy);
+      auto buf1 = static_cast<Tcs *>(buf);
+      return fwd ? exec_<true>(in1, copy1, buf1, nthreads)
+                 : exec_<false>(in1, copy1, buf1, nthreads);
+      }
+  };
+
+template<typename Tfs> Tcpass<Tfs> cfftpass<Tfs>::make_pass(size_t l1,
+  size_t ido, size_t ip, const Troots<Tfs> &roots, bool vectorize)
+  {
+  MR_assert(ip>=1, "no zero-sized FFTs");
+  // do we have a 1D vectorizable FFT?
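+  // Rough map of the dispatch below: a single long transform (l1==1,
+  // ido==1) whose length is a multiple of the SIMD width is handled by
+  // cfftp_vecpass; otherwise, lengths that factorize to a single radix go to
+  // the hard-coded passes (2,3,4,5,7,8,11), other single-factor sizes below
+  // 110 to the generic cfftpg, larger ones to Bluestein's algorithm
+  // (cfftpblue), and composite lengths to a cfft_multipass chain.
+  // Illustration (hypothetical call): cfftpass<double>::make_pass(1, 1, 60, roots)
+  // would end up as a multipass over the factors of 60, e.g. 4*3*5.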
+ if (vectorize && (ip>300)&& (ip<=100000) && (l1==1) && (ido==1)) + { +// constexpr auto vlen = native_simd::size(); +// if constexpr(vlen>=4) + constexpr auto vlen = 4; + if constexpr(simd_exists) + if ((ip&(vlen-1))==0) + return make_shared>(ip, roots); + } + + if (ip==1) return make_shared>(); + auto factors=cfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + case 7: + return make_shared>(l1, ido, roots); + case 8: + return make_shared>(l1, ido, roots); + case 11: + return make_shared>(l1, ido, roots); + default: + if (ip<110) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +#define POCKETFFT_EXEC_DISPATCH \ + virtual void *exec(const type_index &ti, void *in, void *copy, void *buf, \ + bool fwd, size_t nthreads) const \ + { \ + static const auto tifs=tidx(); \ + if (ti==tifs) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + if constexpr (fft1d_simdlen > 1) \ + if constexpr (simd_exists>) \ + { \ + using Tfv = typename simd_select>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 2) \ + if constexpr (simd_exists/2>) \ + { \ + using Tfv = typename simd_select/2>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 4) \ + if constexpr (simd_exists/4>) \ + { \ + using Tfv = typename simd_select/4>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + if constexpr (fft1d_simdlen > 8) \ + if constexpr (simd_exists/8>) \ + { \ + using Tfv = typename simd_select/8>::type; \ + static const auto tifv=tidx(); \ + if (ti==tifv) \ + { \ + auto in1 = static_cast(in); \ + auto copy1 = static_cast(copy); \ + auto buf1 = static_cast(buf); \ + return fwd ? 
exec_(in1, copy1, buf1, nthreads) \ + : exec_(in1, copy1, buf1, nthreads); \ + } \ + } \ + MR_fail("impossible vector length requested"); \ + } + +/* (a+ib) = conj(c+id) * (e+if) */ +template inline void MULPM + (T1 &a, T1 &b, T2 c, T2 d, T3 e, T3 f) + { a=c*e+d*f; b=c*f-d*e; } + +template class rfftp1: public rfftpass + { + public: + rfftp1() {} + virtual size_t bufsize() const { return 0; } + virtual bool needs_copy() const { return false; } + + virtual void *exec(const type_index & /*ti*/, void * in, void * /*copy*/, + void * /*buf*/, bool /*fwd*/, size_t /*nthreads*/) const + { return in; } + }; + +template class rfftp2: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=2; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp3: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=3; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs taur=Tfs(-0.5), + taui=Tfs(0.8660254037844386467637231707529362L); + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp4: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t ip=4; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + constexpr Tfs hsqt2=Tfs(0.707106781186547524400844362104849L); + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftp5: public rfftpass + { + private: + size_t l1, ido; + static constexpr size_t 
ip=5; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + constexpr Tfs tr11= Tfs(0.3090169943749474241022934171828191L), + ti11= Tfs(0.9510565162951535721164393333793821L), + tr12= Tfs(-0.8090169943749474241022934171828191L), + ti12= Tfs(0.5877852522924731291687059546390728L); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k &roots) + : l1(l1_), ido(ido_), wa((ip-1)*(ido-1)) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpg: public rfftpass + { + private: + size_t l1, ido; + size_t ip; + aligned_array wa, csarr; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * /*buf*/, size_t /*nthreads*/) const + { + if constexpr(fwd) + { + size_t ipph=(ip+1)/2; + size_t idl1 = ido*l1; + + auto CC = [cc,this](size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> const Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this] (size_t a, size_t b, size_t c) -> Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1] (size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1] (size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + if (ido>1) + { + for (size_t j=1, jc=ip-1; j=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if (iang>=ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ik=ip) iang-=ip; + Tfs ar=csarr[2*iang], ai=csarr[2*iang+1]; + for (size_t ik=0; ik const Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + auto C1 = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto C2 = [cc,idl1](size_t a, size_t b) -> Tfd& + { return cc[a+idl1*b]; }; + auto CH2 = [ch,idl1](size_t a, size_t b) -> Tfd& + { return ch[a+idl1*b]; }; + + for (size_t k=0; kip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar3=csarr[2*iang], ai3=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar4=csarr[2*iang], ai4=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs ar1=csarr[2*iang], ai1=csarr[2*iang+1]; + iang+=l; if(iang>ip) iang-=ip; + Tfs ar2=csarr[2*iang], ai2=csarr[2*iang+1]; + for (size_t ik=0; ikip) iang-=ip; + Tfs war=csarr[2*iang], wai=csarr[2*iang+1]; + for (size_t ik=0; ik &roots) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), csarr(2*ip) + { + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + 
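+      // 'roots' is a shared table of complex roots of unity whose length is
+      // an integer multiple of N, so every order-N root this pass needs can
+      // be read at index rfct*k, with rfct the oversampling factor computed
+      // next. Worked example (made-up numbers): N=15 with roots->size()==30
+      // gives rfct==2, so the k-th root sits at table index 2*k.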
size_t rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j class rfftpblue: public rfftpass + { + private: + const size_t l1, ido, ip; + aligned_array wa; + const Tcpass cplan; + size_t bufsz; + bool need_cpy; + + auto WA(size_t x, size_t i) const + { return wa[i+x*(ido-1)]; } + + template Tfd *exec_ + (Tfd * DUCC0_RESTRICT cc, Tfd * DUCC0_RESTRICT ch, + Tfd * DUCC0_RESTRICT buf_, size_t nthreads) const + { + using Tcd = Cmplx; + auto buf = reinterpret_cast(buf_); + Tcd *cc2 = &buf[0]; + Tcd *ch2 = &buf[ip]; + Tcd *subbuf = &buf[2*ip]; + static const auto ticd = tidx(); + + if constexpr(fwd) + { + auto CC = [cc,this](size_t a, size_t b, size_t c) -> const Tfd& + { return cc[a+ido*(b+l1*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+ip*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + // copy out + CH(0,0,k) = res[0].r; + for (size_t m=1; m<=ip/2; ++m) + { + CH(ido-1,2*m-1,k)=res[m].r; + CH(0,2*m,k)=res[m].i; + } + } + if (ido==1) return ch; + size_t ipph = (ip+1)/2; + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,0,k) = res[0].r; + CH(i,0,k) = res[0].i; + for (size_t m=1; m Tfd& + { return cc[a+ido*(b+ip*c)]; }; + auto CH = [ch,this](size_t a, size_t b, size_t c) -> Tfd& + { return ch[a+ido*(b+l1*c)]; }; + + for (size_t k=0; k(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + for (size_t m=0; m(cplan->exec(ticd, cc2, ch2, + subbuf, fwd, nthreads)); + CH(i-1,k,0) = res[0].r; + CH(i,k,0) = res[0].i; + for (size_t m=1; m &roots, bool vectorize=false) + : l1(l1_), ido(ido_), ip(ip_), wa((ip-1)*(ido-1)), + cplan(cfftpass::make_pass(1,1,ip,roots,vectorize)) + { + MR_assert(ip&1, "Bluestein length must be odd"); + MR_assert(ido&1, "ido must be odd"); + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; jbufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfft_multipass: public rfftpass + { + private: + const size_t l1, ido; + size_t ip; + vector> passes; + size_t bufsz; + bool need_cpy; + aligned_array wa; + + auto WA(size_t x, size_t i) const + { return wa[(i-1)*(ip-1)+x]; } + + template Tfd *exec_(Tfd *cc, Tfd *ch, Tfd *buf, + size_t nthreads) const + { + static const auto tifd = tidx(); + if ((l1==1) && (ido==1)) + { + Tfd *p1=cc, *p2=ch; + if constexpr (fwd) + for (auto it=passes.rbegin(); it!=passes.rend(); ++it) + { + auto res = static_cast((*it)->exec(tifd, + p1, p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + else + for (const auto &pass: passes) + { + auto res = static_cast(pass->exec(tifd, + p1, p2, buf, fwd, nthreads)); + if (res==p2) swap(p1,p2); + } + return p1; + } + else + MR_fail("not yet supported"); + } + + public: + rfft_multipass(size_t l1_, size_t ido_, size_t ip_, + const Troots &roots, bool /*vectorize*/=false) + : l1(l1_), ido(ido_), ip(ip_), bufsz(0), need_cpy(false), + wa((ip-1)*(ido-1)) + { + size_t N=ip*l1*ido; + auto rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + for (size_t j=1; j::factorize(ip); + + size_t l1l=1; + for (auto fct: factors) + { + passes.push_back(rfftpass::make_pass(l1l, ip/(fct*l1l), fct, roots)); + l1l*=fct; + } + for (const auto &pass: passes) + { + bufsz = max(bufsz, pass->bufsize()); + need_cpy |= pass->needs_copy(); + } + if ((l1!=1)||(ido!=1)) + { + need_cpy=true; + bufsz += 2*ip; + } + } + + virtual size_t 
bufsize() const { return bufsz; } + virtual bool needs_copy() const { return need_cpy; } + + POCKETFFT_EXEC_DISPATCH + }; + +template class rfftp_complexify: public rfftpass + { + private: + size_t N; + Troots roots; + size_t rfct; + Tcpass pass; + size_t l1, ido; + static constexpr size_t ip=2; + + template Tfd *exec_ (Tfd * DUCC0_RESTRICT cc, + Tfd * DUCC0_RESTRICT ch, Tfd * buf, size_t nthreads) const + { + using Tcd = Cmplx; + auto ccc = reinterpret_cast(cc); + auto cch = reinterpret_cast(ch); + auto cbuf = reinterpret_cast(buf); + static const auto ticd = tidx(); + if constexpr(fwd) + { + auto res = static_cast(pass->exec(ticd, + ccc, cch, cbuf, true, nthreads)); + auto rres = (res==ccc) ? ch : cc; + rres[0] = res[0].r+res[0].i; +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + auto xe = res[i]+res[xi].conj(); + auto xo = Tcd(res[i].i+res[xi].i, res[xi].r-res[i].r) + * (*roots)[rfct*i].conj(); + rres[2*i-1] = Tfs(0.5)*(xe.r+xo.r); + rres[2*i] = Tfs(0.5)*(xe.i+xo.i); + rres[2*xi-1] = Tfs(0.5)*(xe.r-xo.r); + rres[2*xi] = Tfs(0.5)*(xo.i-xe.i); + } + rres[N-1] = res[0].r-res[0].i; + return rres; + } + else + { + cch[0] = Tcd(cc[0]+cc[N-1], cc[0]-cc[N-1]); +//FIXME: parallelize? + for (size_t i=1, xi=N/2-1; i<=xi; ++i, --xi) + { + Tcd t1 (cc[2*i-1], cc[2*i]); + Tcd t2 (cc[2*xi-1], -cc[2*xi]); + auto xe = t1+t2; + auto xo = (t1-t2)*(*roots)[rfct*i]; + cch[i] = (xe + Tcd(-xo.i, xo.r)); + cch[xi] = (xe.conj() + Tcd(xo.i, xo.r)); + } + auto res = static_cast(pass->exec(ticd, + cch, ccc, cbuf, false, nthreads)); + return (res==ccc) ? cc : ch; + } + } + + public: + rfftp_complexify(size_t N_, const Troots &roots_, bool vectorize=false) + : N(N_), roots(roots_), pass(cfftpass::make_pass(N/2, vectorize)) + { + rfct = roots->size()/N; + MR_assert(roots->size()==N*rfct, "mismatch"); + MR_assert((N&1)==0, "N must be even"); + } + + virtual size_t bufsize() const { return 2*pass->bufsize(); } + virtual bool needs_copy() const { return true; } + + POCKETFFT_EXEC_DISPATCH + }; +#undef POCKETFFT_EXEC_DISPATCH + +template Trpass rfftpass::make_pass(size_t l1, + size_t ido, size_t ip, const Troots &roots, bool vectorize) + { + MR_assert(ip>=1, "no zero-sized FFTs"); + if (ip==1) return make_shared>(); + if ((ip>1000) && ((ip&1)==0)) // use complex transform + { + bool doit = vectorize&&((ip&7)==0); // vecpass might be beneficial + doit |= ip>10000; // complex multipass might be beneficial + if (!doit) + { + auto factors = rfftpass::factorize(ip); + for (auto factor: factors) + // complex Bluestein or larger prime factor functions might be beneficial + if (factor>5) { doit=true; break; } + } + if (doit) + return make_shared>(ip, roots, vectorize); + } + auto factors=rfftpass::factorize(ip); + if (factors.size()==1) + { + switch(ip) + { + case 2: + return make_shared>(l1, ido, roots); + case 3: + return make_shared>(l1, ido, roots); + case 4: + return make_shared>(l1, ido, roots); + case 5: + return make_shared>(l1, ido, roots); + default: + if (ip<135) + return make_shared>(l1, ido, ip, roots); + else + return make_shared>(l1, ido, ip, roots, vectorize); + } + } + else // more than one factor, need a multipass + return make_shared>(l1, ido, ip, roots, vectorize); + } + +}} + +#endif diff --git a/contrib/ducc0/fft/fftnd_impl.h b/contrib/ducc0/fft/fftnd_impl.h new file mode 100644 index 000000000..1ab98027d --- /dev/null +++ b/contrib/ducc0/fft/fftnd_impl.h @@ -0,0 +1,1828 @@ +/* +This file is part of the ducc FFT library. 
+ +Copyright (C) 2010-2023 Max-Planck-Society +Copyright (C) 2019 Peter Bell + +Authors: Martin Reinecke, Peter Bell +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_FFTND_IMPL_H +#define DUCC0_FFTND_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/useful_macros.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/threading.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/simd.h" +#include "ducc0/infra/mav.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/math/cmplx.h" +#include "ducc0/math/unity_roots.h" +#include "ducc0/fft/fft1d_impl.h" + +/** \file fft.h + * Implementation of multi-dimensional Fast Fourier and related transforms + * \copyright Copyright (C) 2010-2021 Max-Planck-Society + * \copyright Copyright (C) 2019 Peter Bell + * \copyright + * \copyright For the odd-sized DCT-IV transforms: + * \copyright Copyright (C) 2003, 2007-14 Matteo Frigo + * \copyright Copyright (C) 2003, 2007-14 Massachusetts Institute of Technology + * + * \authors Martin Reinecke, Peter Bell + */ + +namespace ducc0 { + +namespace detail_fft { + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +template constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(4, native_simd::size()); +template<> constexpr inline size_t fft_simdlen + = min(8, native_simd::size()); +template using fft_simd = typename simd_select>::type; +template constexpr inline bool fft_simd_exists = (fft_simdlen > 1); + +struct util // hack to avoid duplicate symbols + { + static void sanity_check_axes(size_t ndim, const shape_t &axes) + { + if (ndim==1) + { + if ((axes.size()!=1) || (axes[0]!=0)) + throw std::invalid_argument("bad axes"); + return; + } + shape_t tmp(ndim,0); + if (axes.empty()) throw std::invalid_argument("no axes specified"); + for (auto ax : axes) + { + if (ax>=ndim) throw std::invalid_argument("bad axis number"); + if (++tmp[ax]>1) throw std::invalid_argument("axis specified repeatedly"); + } + } + + DUCC0_NOINLINE static void sanity_check_onetype(const fmav_info &a1, + const fmav_info &a2, bool inplace, const shape_t &axes) + { + sanity_check_axes(a1.ndim(), axes); + MR_assert(a1.conformable(a2), "array sizes are not conformable"); + if (inplace) MR_assert(a1.stride()==a2.stride(), "stride mismatch"); + } + DUCC0_NOINLINE static void sanity_check_cr(const fmav_info &ac, + const fmav_info &ar, const shape_t &axes) + { + sanity_check_axes(ac.ndim(), axes); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i=ac.ndim()) throw std::invalid_argument("bad axis number"); + MR_assert(ac.ndim()==ar.ndim(), "dimension mismatch"); + for (size_t i=0; i std::shared_ptr get_plan(size_t length, bool vectorize=false) + { +#ifdef DUCC0_NO_FFT_CACHE + return std::make_shared(length, vectorize); +#else + constexpr size_t nmax=10; + struct entry { size_t n; bool vectorize; std::shared_ptr ptr; }; + static std::array cache{{{0,0,nullptr}}}; + static std::array last_access{{0}}; + static size_t access_counter = 0; + static Mutex mut; + + auto find_in_cache = [&]() -> std::shared_ptr + { + for (size_t i=0; i(length, vectorize); + { + LockGuard lock(mut); + + auto p = find_in_cache(); + if (p) return p; + + size_t lru = 0; + for (size_t i=1; i class multi_iter + { + private: + shape_t shp, pos; + stride_t str_i, 
str_o; + size_t cshp_i, cshp_o, rem; + ptrdiff_t cstr_i, cstr_o, sstr_i, sstr_o, p_ii, p_i[N], p_oi, p_o[N]; + bool uni_i, uni_o; + + void advance_i() + { + for (size_t i=0; i=1, "not enough dimensions"); + // Sort the extraneous dimensions in order of ascending output stride; + // this should improve overall cache re-use and avoid clashes between + // threads as much as possible. + shape_t idx(iarr.ndim()); + std::iota(idx.begin(), idx.end(), 0); + sort(idx.begin(), idx.end(), + [&oarr](size_t i1, size_t i2) {return oarr.stride(i1) < oarr.stride(i2);}); + for (auto i: idx) + if (i!=idim) + { + pos.push_back(0); + MR_assert(iarr.shape(i)==oarr.shape(i), "shape mismatch"); + shp.push_back(iarr.shape(i)); + str_i.push_back(iarr.stride(i)); + str_o.push_back(oarr.stride(i)); + } + MR_assert(idim0) + { + sstr_i = str_i[0]; + sstr_o = str_o[0]; + } + + if (nshares==1) return; + if (nshares==0) throw std::runtime_error("can't run with zero threads"); + if (myshare>=nshares) throw std::runtime_error("impossible share requested"); + auto [lo, hi] = calcShare(nshares, myshare, rem); + size_t todo = hi-lo; + + size_t chunk = rem; + for (size_t i2=0, i=pos.size()-1; i2(stride_in() *tsz)&4095)==0) + || ((abs(stride_out()*tsz)&4095)==0); + } + bool critical_stride_other(size_t tsz) const + { + if (unistride_i()==0) return false; // it's just one transform + return ((abs(unistride_i()*tsz)&4095)==0) + || ((abs(unistride_o()*tsz)&4095)==0); + } + }; + +template class TmpStorage + { + private: + aligned_array d; + size_t dofs, dstride; + + public: + TmpStorage(size_t n_trafo, size_t bufsize_data, size_t bufsize_trafo, + size_t n_simultaneous, bool inplace) + { + if (inplace) + { + d.resize(bufsize_trafo); + return; + } + constexpr auto vlen = fft_simdlen; + // FIXME: when switching to C++20, use bit_floor(othersize) + size_t buffct = std::min(vlen, n_trafo); + size_t datafct = std::min(vlen, n_trafo); + if (n_trafo>=n_simultaneous*vlen) datafct = n_simultaneous*vlen; + dstride = bufsize_data; + dofs = bufsize_trafo; + // critical stride avoidance + if ((dstride&256)==0) dstride+=16; + if ((dofs&256)==0) dofs += 16; + d.resize(buffct*dofs + datafct*dstride); + } + + template T2 *transformBuf() + { return reinterpret_cast(d.data()); } + template T2 *dataBuf() + { return reinterpret_cast(d.data()) + dofs; } + size_t data_stride() const + { return dstride; } + }; + +template class TmpStorage2 + { + private: + TmpStorage &stg; + + public: + using datatype = T2; + TmpStorage2(TmpStorage &stg_): stg(stg_) {} + + T2 *transformBuf() { return stg.template transformBuf(); } + T2 *dataBuf() { return stg.template dataBuf(); } + size_t data_stride() const { return stg.data_stride(); } + }; + +template DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i tmp; + for (size_t j=0; j DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd *DUCC0_RESTRICT dst) + { + constexpr auto vlen=Tsimd::size(); + const typename Tsimd::value_type * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T *DUCC0_RESTRICT dst) + { + const T * DUCC0_RESTRICT ptr = src.data(); + if (dst == &src.raw(it.iofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx *DUCC0_RESTRICT src, const vfmav> &dst) + { + constexpr auto vlen=Tsimd::size(); + Cmplx * 
DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i tmp(src[i]); + for (size_t j=0; j DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd *DUCC0_RESTRICT src, const vfmav &dst) + { + constexpr auto vlen=Tsimd::size(); + typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T *DUCC0_RESTRICT src, const vfmav &dst) + { + T * DUCC0_RESTRICT ptr=dst.data(); + if (src == &dst.raw(it.oofs(0))) return; // in-place + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav> &src, Cmplx * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + const Cmplx * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, Tsimd * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + const typename Tsimd::value_type * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_input(const Titer &it, + const cfmav &src, T * DUCC0_RESTRICT dst, size_t nvec, size_t vstr) + { + const T * DUCC0_RESTRICT ptr = src.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx * DUCC0_RESTRICT src, const vfmav> &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i tmp(src[j0*vstr+i]); + for (size_t j1=0; j1 DUCC0_NOINLINE void copy_output(const Titer &it, + const Cmplx * DUCC0_RESTRICT src, const vfmav> &dst, size_t nvec, size_t vstr) + { + Cmplx * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const Tsimd * DUCC0_RESTRICT src, const vfmav &dst, size_t nvec, size_t vstr) + { + constexpr auto vlen=Tsimd::size(); + typename Tsimd::value_type * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i DUCC0_NOINLINE void copy_output(const Titer &it, + const T * DUCC0_RESTRICT src, const vfmav &dst, size_t nvec, size_t vstr) + { + T * DUCC0_RESTRICT ptr = dst.data(); + for (size_t i=0; i struct add_vec + { using type = typename simd_select::type; }; +template struct add_vec, vlen> + { using type = Cmplx::type>; }; +template using add_vec_t = typename add_vec::type; + +template +DUCC0_NOINLINE void general_nd(const cfmav &in, const vfmav &out, + const shape_t &axes, T0 fct, size_t nthreads, const Exec &exec, + const bool /*allow_inplace*/=true) + { + if ((in.ndim()==1)&&(in.stride(0)==1)&&(out.stride(0)==1)) + { + auto plan = get_plan(in.shape(0), true); + exec.exec_simple(in.data(), out.data(), *plan, fct, nthreads); + return; + } + std::shared_ptr plan, vplan; + size_t nth1d = (in.ndim()==1) ? nthreads : 1; + + for (size_t iax=0; iaxlength())) + { + plan = get_plan(len, in.ndim()==1); + vplan = ((in.ndim()==1)||(len<300)||((len&3)!=0)) ? + plan : get_plan(len, true); + } + + execParallel(util::thread_count(nthreads, in, axes[iax], fft_simdlen), + [&](Scheduler &sched) + { + constexpr auto vlen = fft_simdlen; + constexpr size_t nmax = 16; + const auto &tin(iax==0? 
in : out); + multi_iter it(tin, out, axes[iax], sched.num_threads(), sched.thread_num()); + + // n_simul: vector size + // n_bunch: total size of bunch (multiple of n_simul) + size_t n_simul=1, n_bunch=1; + bool critstride = (((in.stride(axes[iax])*sizeof(T))&4095)==0) + || (((out.stride(axes[iax])*sizeof(T))&4095)==0); + bool nostride = (in.stride(axes[iax])==1) && (out.stride(axes[iax])==1); + + constexpr size_t l2cache=262144*2; + constexpr size_t cacheline=64; + + // working set size + auto wss = [&](size_t vl) { return sizeof(T)*(2*len*vl + plan->bufsize()); }; + // is the FFT small enough to fit into L2 vectorized? + if (wss(1)>l2cache) // "long" FFT, don't execute more than one at the same time + { + n_simul=1; + if (critstride) // make bunch large to reduce overall copy cost + { + n_bunch=n_simul; + while ((n_bunch storage(in.size()/len, len, max(plan->bufsize(),vplan->bufsize()), (n_bunch+vlen-1)/vlen, inplace); + + // first, do all possible steps of size n_bunch, then n_simul + if (n_bunch>1) + { +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + constexpr size_t lvlen = vlen; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } + if constexpr ((vlen>2) && (simd_exists)) + { + constexpr size_t lvlen = vlen/2; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } + if constexpr ((vlen>4) && (simd_exists)) + { + constexpr size_t lvlen = vlen/4; + if (n_simul>=lvlen) + { + if ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=n_bunch) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *plan, fct, n_bunch/lvlen, nth1d); + } + } + } + if (n_simul==lvlen) + { + if (it.remaining()>=lvlen) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=lvlen) + { + it.advance(lvlen); + exec(it, tin, out, storage2, *plan, fct, nth1d); + } + } + } + } +#endif + { + TmpStorage2 storage2(storage); + while ((n_bunch>n_simul) && (it.remaining()>=n_bunch)) + { + it.advance(n_bunch); + exec.exec_n(it, tin, out, storage2, *vplan, fct, n_bunch, nth1d); + } + } + } + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, tin, out, storage2, *vplan, fct, nth1d, inplace); + } + } + }); // end of parallel region + fct = T0(1); // factor has been applied, use 1 for remaining axes + } + } + +struct ExecC2C + { + bool forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav> &in, + const vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, + size_t nthreads, bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same, T>::value) + if (inplace) + { + 
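+          // In-place fast path: reachable only when the working type equals
+          // the plan's complex scalar type. The data is transformed directly
+          // inside 'out', borrowing just the plan's small transform buffer
+          // instead of staging a full copy through the data buffer.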
if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav> &in, + const vfmav> &out, Tstorage &storage, const pocketfft_c &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const Cmplx *in, Cmplx *out, const pocketfft_c &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct ExecHartley + { + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_hartley &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_hartley &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, nthreads); + } + }; + +struct ExecFHT + { + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_fht &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_fht &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, nthreads); + } + }; + +struct ExecFFTW + { + bool 
forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, forward, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, forward, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_fftw &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_fftw &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, forward, nthreads); + } + }; + +struct ExecDcst + { + bool ortho; + int type; + bool cosine; + + template + DUCC0_NOINLINE void operator() (const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + if (in.data()!=out.data()) + copy_input(it, in, out.data()+it.oofs(0)); + plan.exec_copyback(out.data()+it.oofs(0), storage.transformBuf(), fct, ortho, type, cosine, nthreads); + return; + } + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan.exec(buf2, buf1, fct, ortho, type, cosine, nthreads); + copy_output(it, res, out); + } + template DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const Tplan &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + for (size_t i=0; i DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const Tplan &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + plan.exec(out, fct, ortho, type, cosine, nthreads); + } + }; + +template DUCC0_NOINLINE void general_r2c( + const cfmav &in, const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? 
nthreads : 1; + auto plan = std::make_unique>(in.shape(axis)); + size_t len=in.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j4) + if constexpr( simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + for (size_t j=0; j storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + copy_input(it, in, dbuf); + auto res = plan->exec(dbuf, tbuf, fct, true, nth1d); + auto vout = out.data(); + vout[it.oofs(0)].Set(res[0]); + size_t i=1, ii=1; + if (forward) + for (; i DUCC0_NOINLINE void general_c2r( + const cfmav> &in, const vfmav &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + size_t nth1d = (in.ndim()==1) ? 
nthreads : 1; + auto plan = std::make_unique>(out.shape(axis)); + size_t len=out.shape(axis); + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(out.size()/len, len, plan->bufsize(), 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>=vlen) + { + it.advance(vlen); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/2); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + if constexpr (vlen>4) + if constexpr(simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T> storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + it.advance(vlen/4); + for (size_t j=0; jexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } +#endif + { + TmpStorage2 storage2(storage); + auto dbuf = storage2.dataBuf(); + auto tbuf = storage2.transformBuf(); + while (it.remaining()>0) + { + it.advance(1); + dbuf[0]=in.raw(it.iofs(0)).r; + { + size_t i=1, ii=1; + if (forward) + for (; iexec(dbuf, tbuf, fct, false, nth1d); + copy_output(it, res, out); + } + } + }); // end of parallel region + } + +struct ExecR2R + { + bool r2c, forward; + + template DUCC0_NOINLINE void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, Tstorage &storage, + const pocketfft_r &plan, T0 fct, size_t nthreads, + bool inplace=false) const + { + using T = typename Tstorage::datatype; + if constexpr(is_same::value) + if (inplace) + { + T *buf1=storage.transformBuf(), *buf2=out.data()+it.oofs(0); + if (in.data()!=buf2) + copy_input(it, in, buf2); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void exec_n ( + const Titer &it, const cfmav &in, + const vfmav &out, Tstorage &storage, const pocketfft_r &plan, T0 fct, size_t nvec, + size_t nthreads) const + { + using T = typename Tstorage::datatype; + size_t dstr = storage.data_stride(); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2, nvec, dstr); + if ((!r2c) && forward) + for (size_t k=0; k DUCC0_NOINLINE void exec_simple ( + const T0 *in, T0 *out, const pocketfft_r &plan, T0 fct, + size_t nthreads) const + { + if (in!=out) copy_n(in, plan.length(), out); + if ((!r2c) && forward) + for (size_t i=2; i DUCC0_NOINLINE void c2c(const cfmav> &in, + const vfmav> &out, const shape_t &axes, bool forward, + T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const auto &in2(reinterpret_cast >&>(in)); + const auto &out2(reinterpret_cast >&>(out)); + if ((axes.size()>1) && (in.data()!=out.data())) // optimize axis order + { + if ((in.stride(axes[0])!=1)&&(out.stride(axes[0])==1)) + { + shape_t axes2(axes); + swap(axes2[0],axes2.back()); + general_nd>(in2, out2, axes2, fct, nthreads, ExecC2C{forward}); + return; + } + for (size_t i=1; i>(in2, out2, axes2, fct, nthreads, ExecC2C{forward}); + return; + } + } + general_nd>(in2, out2, axes, fct, nthreads, ExecC2C{forward}); + } + +template DUCC0_NOINLINE void 
dct(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DCT type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, true}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +template DUCC0_NOINLINE void dst(const cfmav &in, const vfmav &out, + const shape_t &axes, int type, T fct, bool ortho, size_t nthreads) + { + if ((type<1) || (type>4)) throw std::invalid_argument("invalid DST type"); + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + const ExecDcst exec{ortho, type, false}; + if (type==1) + general_nd>(in, out, axes, fct, nthreads, exec); + else if (type==4) + general_nd>(in, out, axes, fct, nthreads, exec); + else + general_nd>(in, out, axes, fct, nthreads, exec); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, size_t axis, bool forward, T fct, + size_t nthreads) + { + util::sanity_check_cr(out, in, axis); + if (in.size()==0) return; + const auto &out2(reinterpret_cast>&>(out)); + general_r2c(in, out2, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2c(const cfmav &in, + const vfmav> &out, const shape_t &axes, + bool forward, T fct, size_t nthreads) + { + util::sanity_check_cr(out, in, axes); + if (in.size()==0) return; + r2c(in, out, axes.back(), forward, fct, nthreads); + if (axes.size()==1) return; + + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(out, out, newaxes, forward, T(1), nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, size_t axis, bool forward, T fct, size_t nthreads) + { + util::sanity_check_cr(in, out, axis); + if (in.size()==0) return; + const auto &in2(reinterpret_cast>&>(in)); + general_c2r(in2, out, axis, forward, fct, nthreads); + } + +template DUCC0_NOINLINE void c2r(const cfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads) + { + if (axes.size()==1) + return c2r(in, out, axes[0], forward, fct, nthreads); + util::sanity_check_cr(in, out, axes); + if (in.size()==0) return; + auto atmp(vfmav>::build_noncritical(in.shape(), UNINITIALIZED)); + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(in, atmp, newaxes, forward, T(1), nthreads); + c2r(atmp, out, axes.back(), forward, fct, nthreads); + } + +template DUCC0_NOINLINE void c2r_mut(const vfmav> &in, + const vfmav &out, const shape_t &axes, bool forward, T fct, + size_t nthreads) + { + if (axes.size()==1) + return c2r(in, out, axes[0], forward, fct, nthreads); + util::sanity_check_cr(in, out, axes); + if (in.size()==0) return; + auto newaxes = shape_t{axes.begin(), --axes.end()}; + c2c(in, in, newaxes, forward, T(1), nthreads); + c2r(in, out, axes.back(), forward, fct, nthreads); + } + +template DUCC0_NOINLINE void r2r_fftpack(const cfmav &in, + const vfmav &out, const shape_t &axes, bool real2hermitian, bool forward, + T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecR2R{real2hermitian, forward}); + } + +template DUCC0_NOINLINE void r2r_fftw(const cfmav &in, + const vfmav &out, const shape_t &axes, bool forward, + T fct, size_t nthreads) + { + 
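+  // FFTW-style "halfcomplex" r2r: for length n the forward result is laid
+  // out as r0,r1,...,r_{n/2},i_{(n+1)/2-1},...,i_1 (the ordering FFTW
+  // documents for its R2HC kind), in contrast to r2r_fftpack above, which
+  // interleaves real and imaginary parts.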
util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecFFTW{forward}); + } + +template DUCC0_NOINLINE void r2r_separable_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecHartley{}, false); + } + +template DUCC0_NOINLINE void r2r_separable_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + general_nd>(in, out, axes, fct, nthreads, + ExecFHT{}, false); + } + +template void hermiteHelper(size_t idim, ptrdiff_t iin, + ptrdiff_t iout0, ptrdiff_t iout1, const cfmav &c, + const vfmav &r, const shape_t &axes, Func func, size_t nthreads) + { + auto cstr=c.stride(idim), str=r.stride(idim); + auto len=r.shape(idim); + + if (idim+1==c.ndim()) // last dimension, not much gain in parallelizing + { + if (idim==axes.back()) // halfcomplex axis + for (size_t i=0,ic=0; i void oscarize(const vfmav &data, size_t ax0, size_t ax1, + size_t nthreads) + { + auto nu=data.shape(ax0), nv=data.shape(ax1); + if ((nu<3)||(nv<3)) return; + vector slc(data.ndim()); + slc[ax0] = slice(1,(nu+1)/2); + slc[ax1] = slice(1,(nv+1)/2); + auto all = subarray(data, slc); + slc[ax0] = slice(nu-1,nu/2,-1); + auto ahl = subarray(data, slc); + slc[ax1] = slice(nv-1,nv/2,-1); + auto ahh = subarray(data, slc); + slc[ax0] = slice(1,(nu+1)/2); + auto alh = subarray(data, slc); + mav_apply([](T &ll, T &hl, T &hh, T &lh) + { + T tll=ll, thl=hl, tlh=lh, thh=hh; + T v = T(0.5)*(tll+tlh+thl+thh); + ll = v-thh; + hl = v-tlh; + lh = v-thl; + hh = v-tll; + }, nthreads, all, ahl, ahh, alh); + } + +template void r2r_genuine_hartley(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + if (axes.size()==1) + return r2r_separable_hartley(in, out, axes, fct, nthreads); + if (axes.size()==2) + { + r2r_separable_hartley(in, out, axes, fct, nthreads); + oscarize(out, axes[0], axes[1], nthreads); + return; + } + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + shape_t tshp(in.shape()); + tshp[axes.back()] = tshp[axes.back()]/2+1; + auto atmp(vfmav>::build_noncritical(tshp, UNINITIALIZED)); + r2c(in, atmp, axes, true, fct, nthreads); + hermiteHelper(0, 0, 0, 0, atmp, out, axes, [](const std::complex &c, T &r0, T &r1) + { + r0 = c.real()+c.imag(); + r1 = c.real()-c.imag(); + }, nthreads); + } + +template void r2r_genuine_fht(const cfmav &in, + const vfmav &out, const shape_t &axes, T fct, size_t nthreads) + { + if (axes.size()==1) + return r2r_separable_fht(in, out, axes, fct, nthreads); + if (axes.size()==2) + { + r2r_separable_fht(in, out, axes, fct, nthreads); + oscarize(out, axes[0], axes[1], nthreads); + return; + } + util::sanity_check_onetype(in, out, in.data()==out.data(), axes); + if (in.size()==0) return; + shape_t tshp(in.shape()); + tshp[axes.back()] = tshp[axes.back()]/2+1; + auto atmp(vfmav>::build_noncritical(tshp, UNINITIALIZED)); + r2c(in, atmp, axes, true, fct, nthreads); + hermiteHelper(0, 0, 0, 0, atmp, out, axes, [](const std::complex &c, T &r0, T &r1) + { + r0 = c.real()-c.imag(); + r1 = c.real()+c.imag(); + }, nthreads); + } + +template +DUCC0_NOINLINE void general_convolve_axis(const cfmav &in, const vfmav &out, + const size_t axis, const cmav 
&kernel, size_t nthreads, + const Exec &exec) + { + std::unique_ptr plan1, plan2; + + size_t l_in=in.shape(axis), l_out=out.shape(axis); + MR_assert(kernel.size()==l_in, "bad kernel size"); + plan1 = std::make_unique(l_in); + plan2 = std::make_unique(l_out); + size_t bufsz = max(plan1->bufsize(), plan2->bufsize()); + + vmav fkernel({kernel.shape(0)}, UNINITIALIZED); + for (size_t i=0; iexec(fkernel.data(), T0(1)/T0(l_in), true, nthreads); + + execParallel( + util::thread_count(nthreads, in, axis, fft_simdlen), + [&](Scheduler &sched) { + constexpr auto vlen = fft_simdlen; + TmpStorage storage(in.size()/l_in, l_in+l_out, bufsz, 1, false); + multi_iter it(in, out, axis, sched.num_threads(), sched.thread_num()); +#ifndef DUCC0_NO_SIMD + if constexpr (vlen>1) + { + TmpStorage2,T,T0> storage2(storage); + while (it.remaining()>=vlen) + { + it.advance(vlen); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + if constexpr (vlen>2) + if constexpr (simd_exists) + if (it.remaining()>=vlen/2) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/2); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + if constexpr (vlen>4) + if constexpr (simd_exists) + if (it.remaining()>=vlen/4) + { + TmpStorage2,T,T0> storage2(storage); + it.advance(vlen/4); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } +#endif + { + TmpStorage2 storage2(storage); + while (it.remaining()>0) + { + it.advance(1); + exec(it, in, out, storage2, *plan1, *plan2, fkernel); + } + } + }); // end of parallel region + } + +struct ExecConv1R + { + template void operator() ( + const Titer &it, const cfmav &in, const vfmav &out, + Tstorage &storage, const pocketfft_r &plan1, const pocketfft_r &plan2, + const cmav &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + plan1.exec_copyback(buf2, buf1, T0(1), true); + auto res = buf2; + { + res[0] *= fkernel(0); + size_t i; + for (i=1; 2*i t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + auto t3 = t1*t2; + res[2*i-1] = t3.r; + res[2*i] = t3.i; + } + if (2*i==l_min) + { + if (l_min t1(res[2*i-1], res[2*i]); + Cmplx t2(fkernel(2*i-1), fkernel(2*i)); + res[2*i-1] = (t1*t2).r*T0(2); + } + else + res[2*i-1] *= fkernel(2*i-1); + } + } + for (size_t i=l_in; i void operator() ( + const Titer &it, const cfmav> &in, const vfmav> &out, + Tstorage &storage, const pocketfft_c &plan1, const pocketfft_c &plan2, + const cmav,1> &fkernel) const + { + using T = typename Tstorage::datatype; + size_t l_in = plan1.length(), + l_out = plan2.length(), + l_min = std::min(l_in, l_out); + T *buf1=storage.transformBuf(), *buf2=storage.dataBuf(); + copy_input(it, in, buf2); + auto res = plan1.exec(buf2, buf1, T0(1), true); + auto res2 = buf2+l_in; + { + res2[0] = res[0]*fkernel(0); + size_t i; + for (i=1; 2*i DUCC0_NOINLINE void convolve_axis(const cfmav &in, + const vfmav &out, size_t axis, const cmav &kernel, size_t nthreads) + { + MR_assert(axis, T>(in, out, axis, kernel, nthreads, + ExecConv1R()); + } +template DUCC0_NOINLINE void convolve_axis(const cfmav> &in, + const vfmav> &out, size_t axis, const cmav,1> &kernel, + size_t nthreads) + { + MR_assert(axis>&>(in)); + const auto &out2(reinterpret_cast>&>(out)); + const auto &kernel2(reinterpret_cast,1>&>(kernel)); + general_convolve_axis, T>(in2, out2, axis, kernel2, nthreads, + ExecConv1C()); + } + +} // namespace detail_fft 
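+
+// ---------------------------------------------------------------------------
+// Editorial note (not part of the upstream ducc0 sources): a minimal usage
+// sketch of the c2c interface declared above, assuming the usual
+// `using detail_fft::c2c;` re-export at this point in the header. The
+// function and variable names here are hypothetical.
+//
+//   #include <complex>
+//   #include "ducc0/fft/fft.h"
+//
+//   inline void forward_fft_2d(const std::complex<double> *in,
+//                              std::complex<double> *out,
+//                              size_t nx, size_t ny, size_t nthreads)
+//     {
+//     // cfmav/vfmav are non-owning views; passing only the shape makes the
+//     // strides default to a C-contiguous layout.
+//     ducc0::cfmav<std::complex<double>> min(in, {nx, ny});
+//     ducc0::vfmav<std::complex<double>> mout(out, {nx, ny});
+//     // transform along both axes; fct=1.0 applies no normalization
+//     ducc0::c2c(min, mout, {0, 1}, true, 1.0, nthreads);
+//     }
+// ---------------------------------------------------------------------------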
+ +} // namespace ducc0 + +#endif // POCKETFFT_HDRONLY_H diff --git a/contrib/ducc0/infra/aligned_array.h b/contrib/ducc0/infra/aligned_array.h new file mode 100644 index 000000000..f2fc9835e --- /dev/null +++ b/contrib/ducc0/infra/aligned_array.h @@ -0,0 +1,175 @@ +/** \file ducc0/infra/aligned_array.h + * + * \copyright Copyright (C) 2019-2022 Max-Planck-Society + * \author Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_ALIGNED_ARRAY_H +#define DUCC0_ALIGNED_ARRAY_H + +#include +#include + +namespace ducc0 { + +namespace detail_aligned_array { + +using namespace std; + +// std::aligned_alloc is a bit cursed ... it doesn't exist on MacOS < 10.15 +// and in musl. Let's unconditionally work around it for now. +//#if ((__cplusplus >= 201703L) && (!defined(__APPLE__))) +#define DUCC0_WORKAROUND_ALIGNED_ALLOC +//#endif + +/// Bare bones array class. +/** Mostly useful for uninitialized temporary buffers. + * \note Since this class operates on raw memory, it should only be used with + * POD types, and even then only with caution! 
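+ *
+ * (Editorial note: where aligned_alloc cannot be used, ralloc() below
+ * emulates it by over-allocating `alignment` extra bytes, advancing the
+ * returned pointer to an alignment boundary, and stashing the original
+ * malloc() pointer in the slot immediately below that address so that
+ * dealloc() can recover and free it.)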
*/ +template class array_base + { + private: + T *p; + size_t sz; + + static T *ralloc(size_t num) + { + if constexpr(alignment<=alignof(max_align_t)) + { + void *res = malloc(num*sizeof(T)); + if (!res) throw bad_alloc(); + return reinterpret_cast(res); + } + else + { + if (num==0) return nullptr; +#if (!defined(DUCC0_WORKAROUND_ALIGNED_ALLOC)) + // aligned_alloc requires the allocated size to be a multiple of the + // requested alignment, so increase size if necessary + void *res = aligned_alloc(alignment,((num*sizeof(T)+alignment-1)/alignment)*alignment); + if (!res) throw bad_alloc(); +#else // portable emulation + void *ptr = malloc(num*sizeof(T)+alignment); + if (!ptr) throw bad_alloc(); + void *res = reinterpret_cast((reinterpret_cast(ptr) & ~(size_t(alignment-1))) + alignment); + (reinterpret_cast(res))[-1] = ptr; +#endif + return reinterpret_cast(res); + } + } + static void dealloc(T *ptr) + { + if constexpr(alignment<=alignof(max_align_t)) + free(ptr); + else +#if (!defined(DUCC0_WORKAROUND_ALIGNED_ALLOC)) + free(ptr); +#else + if (ptr) free((reinterpret_cast(ptr))[-1]); +#endif + } + +#undef DUCC0_WORKAROUND_ALIGNED_ALLOC + + public: + /// Creates a zero-sized array with no associated memory. + array_base() : p(nullptr), sz(0) {} + /// Creates an array with \a n entries. + /** \note Memory is not initialized! */ + array_base(size_t n) : p(ralloc(n)), sz(n) {} + array_base(const array_base &) = delete; + array_base(array_base &&other) + : p(other.p), sz(other.sz) + { other.p=nullptr; other.sz=0; } + ~array_base() { dealloc(p); } + + array_base &operator=(const array_base &) = delete; + array_base &operator=(array_base &&other) + { + swap(p, other.p); + swap(sz, other.sz); + return *this; + } + + /// If \a n is different from the current size, resizes the array to hold + /// \a n elements. + /** \note No data content is copied, the new array is uninitialized! */ + void resize(size_t n) + { + if (n==sz) return; + dealloc(p); + p = ralloc(n); + sz = n; + } + + /// Returns a writeable reference to the element at index \a idx. + T &operator[](size_t idx) { return p[idx]; } + /// Returns a read-only reference to the element at index \a idx. + const T &operator[](size_t idx) const { return p[idx]; } + + /// Returns a writeable pointer to the array data. + T *data() { return p; } + /// Returns a read-only pointer to the array data. + const T *data() const { return p; } + + /// Returns the size of the array. + size_t size() const { return sz; } + }; + +template using quick_array = array_base; +template using aligned_array = array_base; + +} + +using detail_aligned_array::aligned_array; +using detail_aligned_array::quick_array; + +} + +#endif + diff --git a/contrib/ducc0/infra/error_handling.h b/contrib/ducc0/infra/error_handling.h new file mode 100644 index 000000000..bfd4ea17a --- /dev/null +++ b/contrib/ducc0/infra/error_handling.h @@ -0,0 +1,120 @@ +/** \file ducc0/infra/error_handling.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. 
+* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_ERROR_HANDLING_H +#define DUCC0_ERROR_HANDLING_H + +#include +#include +#include "ducc0/infra/useful_macros.h" + +namespace ducc0 { + +namespace detail_error_handling { + +#if defined (__GNUC__) +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__, __PRETTY_FUNCTION__) +#else +#define DUCC0_ERROR_HANDLING_LOC_ ::ducc0::detail_error_handling::CodeLocation(__FILE__, __LINE__) +#endif + +// to be replaced with std::source_location once generally available +class CodeLocation + { + private: + const char *file, *func; + int line; + + public: + CodeLocation(const char *file_, int line_, const char *func_=nullptr) + : file(file_), func(func_), line(line_) {} + + inline ::std::ostream &print(::std::ostream &os) const + { + os << "\n" << file << ": " << line; + if (func) os << " (" << func << ")"; + os << ":\n"; + return os; + } + }; + +inline ::std::ostream &operator<<(::std::ostream &os, const CodeLocation &loc) + { return loc.print(os); } + +template +void streamDump__(::std::ostream &os, Args&&... args) + { (os << ... << args); } +template +[[noreturn]] DUCC0_NOINLINE void fail__(Args&&... args) + { + ::std::ostringstream msg; \ + ::ducc0::detail_error_handling::streamDump__(msg, std::forward(args)...); \ + throw ::std::runtime_error(msg.str()); \ + } + +/// Throws a std::runtime_error containing the code location and the +/// passed arguments. +#define MR_fail(...) 
\ + do { \ + ::ducc0::detail_error_handling::fail__(DUCC0_ERROR_HANDLING_LOC_, "\n", ##__VA_ARGS__, "\n"); \ + } while(0) + +/// If \a cond is false, throws a std::runtime_error containing the code +/// location and the passed arguments. +#define MR_assert(cond,...) \ + do { \ + if (cond); \ + else { MR_fail("Assertion failure\n", ##__VA_ARGS__); } \ + } while(0) + +}} + +#endif diff --git a/contrib/ducc0/infra/mav.h b/contrib/ducc0/infra/mav.h new file mode 100644 index 000000000..eef71ef5d --- /dev/null +++ b/contrib/ducc0/infra/mav.h @@ -0,0 +1,1354 @@ +/*! \file ducc0/infra/mav.h + * Classes for dealing with multidimensional arrays + * + * \copyright Copyright (C) 2019-2023 Max-Planck-Society + * \author Martin Reinecke + * */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_MAV_H +#define DUCC0_MAV_H + +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/aligned_array.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/threading.h" + +namespace ducc0 { + +namespace detail_mav { + +using namespace std; + +// the next line is necessary to address some sloppy name choices in hipSYCL +using std::min, std::max; + +struct uninitialized_dummy {}; +constexpr uninitialized_dummy UNINITIALIZED; + +template class cmembuf + { + protected: + shared_ptr> ptr; + shared_ptr> rawptr; + const T *d; + + cmembuf(const T *d_, const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(d_) {} + + // externally owned data pointer + cmembuf(const T *d_) + : d(d_) {} + // share another memory buffer, but read-only + cmembuf(const cmembuf &other) + : ptr(other.ptr), rawptr(other.rawptr), d(other.d) {} + cmembuf(size_t sz) + : ptr(make_shared>(sz)), d(ptr->data()) {} +#if 1 + cmembuf(size_t sz, uninitialized_dummy) + : rawptr(make_shared>(sz)), d(rawptr->data()) {} +# else // "poison" the array with a fixed value; use for debugging + cmembuf(size_t sz, uninitialized_dummy) + : rawptr(make_shared>(sz)), d(rawptr->data()) + { for (size_t i=0; i const T &raw(I i) const + { return d[i]; } + // read access to data area + const T *data() const + { return d; } + }; + +constexpr size_t MAXIDX=~(size_t(0)); + +struct slice + { + size_t beg, end; + ptrdiff_t step; + slice() : beg(0), end(MAXIDX), step(1) {} + slice(size_t idx) : beg(idx), end(idx), step(1) {} + slice(size_t beg_, size_t end_, ptrdiff_t step_=1) + : beg(beg_), end(end_), step(step_) + { +// FIXME: add sanity checks here + } + + size_t size(size_t shp) const + { + if (beg==end) return 0; + if (step>0) return (min(shp,end)-beg+step-1)/step; + // negative step + if (end==MAXIDX) + return (beg-step)/(-step); + return (beg-end-step-1)/(-step); + } + }; + +/// Helper class containing shape and stride information of an `fmav` object +class fmav_info + { + public: + /// vector of nonnegative integers for storing the array shape + using shape_t = vector; + /// vector of integers for storing the array strides + using stride_t = vector; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + auto ndim = shp.size(); + // MR using the static_cast just to avoid a GCC warning. +// stride_t res(ndim); + stride_t res(static_cast(ndim)); + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... ns) const + { return str[dim]*ptrdiff_t(n) + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*ptrdiff_t(n); } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs a 1D object with all extents and strides set to zero. + fmav_info() : shp(1,0), str(1,0), sz(0) {} + /// Constructs an object with the given shape and stride. 
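+  /// (Strides are given in units of elements, not bytes; negative strides
+  /// are legal and arise naturally from slices with negative step.)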
+ fmav_info(const shape_t &shape_, const stride_t &stride_) + : shp(shape_), str(stride_), + sz(accumulate(shp.begin(),shp.end(),size_t(1),multiplies<>())) + { + MR_assert(shp.size()==str.size(), "dimensions mismatch"); + } + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + fmav_info(const shape_t &shape_) + : fmav_info(shape_, shape2stride(shape_)) {} + void assign(const fmav_info &other) + { + shp = other.shp; + str = other.str; + sz = other.sz; + } + /// Returns the dimensionality of the object. + size_t ndim() const { return shp.size(); } + /// Returns the total number of entries in the object. + size_t size() const { return sz; } + /// Returns the shape of the object. + const shape_t &shape() const { return shp; } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return shp[i]; } + /// Returns the strides of the object. + const stride_t &stride() const { return str; } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return str[i]; } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return ((ndim()==0) || (str.back()==1)); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { + auto ndim = shp.size(); + ptrdiff_t stride=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const fmav_info &other) const + { return shp==other.shp; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + MR_assert(ndim()==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + ptrdiff_t idx(const shape_t &ns) const + { + MR_assert(ndim()==ns.size(), "incorrect number of indices"); + size_t res = 0; + for (size_t i=0; i ptrdiff_t idxval(RAiter beg, RAiter end) const + { + MR_assert(ndim()==size_t(end-beg), "incorrect number of indices"); + size_t res = 0; + for (size_t i=0; i=shp.size(), "cannot reduce dimensionality"); + stride_t newstr(shp2.size(), 0); + for (size_t i=0; i=ndim(), + "new shape smaller than original one"); + MR_assert(axpos.size()==ndim(), "bad axpos size"); + stride_t new_stride(new_shape.size(), 0); + vector used(new_shape.size(),0); + for (size_t i=0; i &slices) const + { + auto ndim = shp.size(); + shape_t nshp(ndim); + stride_t nstr(ndim); + MR_assert(slices.size()==ndim, "incorrect number of slices"); + size_t n0=0; + for (auto x:slices) if (x.beg==x.end) ++n0; + ptrdiff_t nofs=0; + nshp.resize(ndim-n0); + nstr.resize(ndim-n0); + for (size_t i=0, i2=0; i class mav_info + { + public: + /// Fixed-size array of nonnegative integers for storing the array shape + using shape_t = array; + /// Fixed-size array of integers for storing the array strides + using stride_t = array; + + protected: + shape_t shp; + stride_t str; + size_t sz; + + static stride_t shape2stride(const shape_t &shp) + { + stride_t res; + if (ndim==0) return res; + res[ndim-1]=1; + for (size_t i=2; i<=ndim; ++i) + res[ndim-i] = res[ndim-i+1]*ptrdiff_t(shp[ndim-i+1]); + return res; + } + template ptrdiff_t getIdx(size_t dim, size_t n, Ns... 
ns) const + { return str[dim]*n + getIdx(dim+1, ns...); } + ptrdiff_t getIdx(size_t dim, size_t n) const + { return str[dim]*n; } + ptrdiff_t getIdx(size_t /*dim*/) const + { return 0; } + + public: + /// Constructs an object with all extents and strides set to zero. + mav_info() : sz(0) + { + for (size_t i=0; i())) {} + /// Constructs an object with the given shape and computes the strides + /// automatically, assuming a C-contiguous memory layout. + mav_info(const shape_t &shape_) + : mav_info(shape_, shape2stride(shape_)) {} + mav_info(const fmav_info &inp) + { + MR_assert(inp.ndim()==ndim, "dimensionality mismatch"); + sz=1; + for (size_t i=0; ishape and \a other.shape match. + bool conformable(const mav_info &other) const + { return shp==other.shp; } + /// Returns true iff this->shape and \a other match. + bool conformable(const shape_t &other) const + { return shp==other; } + /// Returns the one-dimensional index of an entry from the given + /// multi-dimensional index tuple, taking strides into account. + template ptrdiff_t idx(Ns... ns) const + { + static_assert(ndim==sizeof...(ns), "incorrect number of indices"); + return getIdx(0, ns...); + } + mav_info transpose() const + { + shape_t shp2; + stride_t str2; + for (size_t i=0; i prepend_1() const + { + typename mav_info::shape_t newshp; + typename mav_info::stride_t newstr; + newshp[0] = 1; + newstr[0] = 0; + for (size_t i=0; i(newshp, newstr); + } + + protected: + template auto subdata(const vector &slices) const + { + MR_assert(slices.size()==ndim, "bad number of slices"); + array nshp; + array nstr; + + // unnecessary, but gcc warns otherwise + for (size_t i=0; i(nshp, nstr), nofs); + } + }; + +template class cfmav: public fmav_info, public cmembuf + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + using fmav_info::idx; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + + + protected: + cfmav(const shape_t &shp_) + : tinfo(shp_), tbuf(size()) {} + cfmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : tinfo(shp_, str_), tbuf(size(), UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i const T &operator()(Ns... 
ns) const + { return raw(idx(ns...)); } + const T &operator()(const shape_t &ns) const + { return raw(idx(ns)); } + template const T& val(RAiter beg, RAiter end) const + { return raw(idxval(beg, end)); } + + cfmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = subdata(slices); + return cfmav(ninfo, tbuf::d+nofs, *this); + } + cfmav extend_and_broadcast(const shape_t &new_shape, const shape_t &axpos) const + { + return cfmav(fmav_info::extend_and_broadcast(new_shape, axpos), *this); + } + cfmav extend_and_broadcast(const shape_t &new_shape, size_t firstaxis) const + { + return cfmav(fmav_info::extend_and_broadcast(new_shape, firstaxis), *this); + } + cfmav transpose() const + { + return cfmav(static_cast(this)->transpose(), *static_cast(this)); + } + }; + +template cfmav subarray + (const cfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class vfmav: public cfmav + { + protected: + using tbuf = cmembuf; + using tinfo = fmav_info; + using tinfo::shp, tinfo::str; + using fmav_info::idx; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tinfo::size, tinfo::shape, tinfo::stride; + + protected: + vfmav(const fmav_info &info, const tbuf &buf) + : cfmav(info, buf) {} + vfmav(const fmav_info &info, T *d_, const tbuf &buf) + : cfmav(info, d_, buf) {} + + public: + using tbuf::raw, tbuf::data, tinfo::ndim; + vfmav() {} + vfmav(T *d_, const fmav_info &info) + : cfmav(d_, info) {} + vfmav(T *d_, const shape_t &shp_, const stride_t &str_) + : cfmav(d_, shp_, str_) {} + vfmav(T *d_, const shape_t &shp_) + : cfmav(d_, shp_) {} + vfmav(const shape_t &shp_) + : cfmav(shp_) {} + vfmav(const shape_t &shp_, uninitialized_dummy) + : cfmav(shp_, UNINITIALIZED) {} + vfmav(const shape_t &shp_, const stride_t &str_, uninitialized_dummy) + : cfmav(shp_, str_, UNINITIALIZED) + { + ptrdiff_t ofs=0; + for (size_t i=0; i(buf, shp_, str_) {} + + T *data() const + { return const_cast(tbuf::d); } + template T &raw(I i) const + { return data()[i]; } + + // no-op. Needed for template tricks. + using cfmav::to_fmav; + vfmav to_fmav() const { return *this; } + + void assign(const vfmav &other) + { + fmav_info::assign(other); + cmembuf::assign(other); + } + + using cfmav::operator(); + template T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + T &operator()(const shape_t &ns) const + { return raw(idx(ns)); } + using cfmav::val; + template T& val(RAiter beg, RAiter end) const + { return raw(idxval(beg, end)); } + + vfmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::subdata(slices); + return vfmav(ninfo, data()+nofs, *this); + } + /** Returns a writable fmav with the specified shape. + * The strides are chosen in such a way that critical strides (multiples + * of 4096 bytes) along any dimension are avoided, by enlarging the + * allocated memory slightly if necessary. + * The array data is default-initialized. 
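+   * (A "critical" stride is one whose byte distance is a near-multiple of
+   * the 4096-byte granularity used by noncritical_shape() in misc_utils.h;
+   * such strides funnel many elements into the same cache/TLB sets and can
+   * cost a large factor in throughput.)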
*/ + static vfmav build_noncritical(const shape_t &shape) + { + auto ndim = shape.size(); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vfmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i slc(ndim); + for (size_t i=0; i(this)->transpose(), *static_cast(this)); + } + }; + +template vfmav subarray + (const vfmav &arr, const vector &slices) + { return arr.subarray(slices); } + +template class cmav: public mav_info, public cmembuf + { + protected: + template friend class cmav; + template friend class vmav; + + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + cmav() {} + cmav(const shape_t &shp_, uninitialized_dummy) + : tinfo(shp_), tbuf(size(), UNINITIALIZED) {} + cmav(const shape_t &shp_) + : tinfo(shp_), tbuf(size()) {} + cmav(const tbuf &buf, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(buf) {} + cmav(const tinfo &info, const T *d_, const tbuf &buf) + : tinfo(info), tbuf(d_, buf) {} + cmav(const tinfo &info, const tbuf &buf) + : tinfo(info), tbuf(buf) {} + + public: + cmav(const T *d_, const shape_t &shp_, const stride_t &str_) + : tinfo(shp_, str_), tbuf(d_) {} + cmav(const T *d_, const shape_t &shp_) + : tinfo(shp_), tbuf(d_) {} + cmav(const cfmav &inp) + : tinfo(inp), tbuf(inp) {} + void assign(const cmav &other) + { + mav_info::assign(other); + cmembuf::assign(other); + } + operator cfmav() const + { + return cfmav(*this, {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + // Needed for template tricks. + cfmav to_fmav() const { return operator cfmav(); } + + template const T &operator()(Ns... ns) const + { return raw(idx(ns...)); } + template cmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return cmav (ninfo, tbuf::d+nofs, *this); + } + + static cmav build_uniform(const shape_t &shape, const T &value) + { + // Don't do this at home! 
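+    // (All-zero strides below make every index of the returned view alias
+    // the single element allocated here; the const_cast write is the only
+    // mutation that element ever sees.)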
+ shape_t tshp; + tshp.fill(1); + cmav tmp(tshp); + const_cast(tmp.raw(0)) = value; + stride_t nstr; + nstr.fill(0); + return cmav(tmp, shape, nstr); + } + cmav transpose() const + { + return cmav(static_cast(this)->transpose(), *static_cast(this)); + } + cmav prepend_1() const + { + return cmav(static_cast(this)->prepend_1(), *static_cast(this)); + } + template cmav reinterpret + (const typename cmav::shape_t &newshp, + const typename cmav::stride_t &newstr) const + { + return cmav(*static_cast(this), newshp, newstr); + } + }; +template cmav subarray + (const cmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +template class vmav: public cmav + { + protected: + template friend class vmav; + + using parent = cmav; + using tinfo = mav_info; + using tbuf = cmembuf; + using tinfo::shp, tinfo::str; + + public: + using typename tinfo::shape_t; + using typename tinfo::stride_t; + using tbuf::raw, tbuf::data; + using tinfo::contiguous, tinfo::size, tinfo::idx, tinfo::conformable; + + protected: + vmav(const tinfo &info, T *d_, const tbuf &buf) + : parent(info, d_, buf) {} + vmav(const tinfo &info, const tbuf &buf) + : parent(info, buf) {} + vmav(const tbuf &buf, const shape_t &shp_, const stride_t &str_) + : parent(buf, shp_, str_){} + + public: + vmav() {} + vmav(T *d_, const shape_t &shp_, const stride_t &str_) + : parent(d_, shp_, str_) {} + vmav(T *d_, const shape_t &shp_) + : parent(d_, shp_) {} + vmav(const shape_t &shp_) + : parent(shp_) {} + vmav(const shape_t &shp_, uninitialized_dummy) + : parent(shp_, UNINITIALIZED) {} + vmav(const vfmav &inp) + : parent(inp) {} + + void assign(vmav &other) + { parent::assign(other); } + void dealloc() + { + vmav empty; + assign(empty); + } + operator vfmav() const + { + return vfmav(*const_cast(static_cast(this)), {shp.begin(), shp.end()}, {str.begin(), str.end()}); + } + // Needed for template tricks. + using cmav::to_fmav; + vfmav to_fmav() const { return operator vfmav(); } + + using parent::operator(); + template T &operator()(Ns... 
ns) const + { return const_cast(parent::operator()(ns...)); } + + template vmav subarray(const vector &slices) const + { + auto [ninfo, nofs] = tinfo::template subdata (slices); + return vmav (ninfo, data()+nofs, *this); + } + + T *data() const + { return const_cast(tbuf::d); } + // read access to element #i + template T &raw(I i) const + { return data()[i]; } + + static vmav build_empty() + { + shape_t nshp; + nshp.fill(0); + return vmav(static_cast(nullptr), nshp); + } + + static vmav build_noncritical(const shape_t &shape) + { + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2); + vector slc(ndim); + for (size_t i=0; i(slc); + } + static vmav build_noncritical(const shape_t &shape, uninitialized_dummy) + { + if (ndim<=1) return vmav(shape, UNINITIALIZED); + auto shape2 = noncritical_shape(shape, sizeof(T)); + vmav tmp(shape2, UNINITIALIZED); + vector slc(ndim); + for (size_t i=0; i(slc); + } + vmav transpose() const + { + return vmav(static_cast(this)->transpose(), *static_cast(this)); + } + vmav prepend_1() const + { + return vmav(static_cast(this)->prepend_1(), *static_cast(this)); + } + template vmav reinterpret + (const typename vmav::shape_t &newshp, + const typename vmav::stride_t &newstr) const + { + return vmav(*static_cast(this), newshp, newstr); + } + }; + +template vmav subarray + (const vmav &arr, const vector &slices) + { return arr.template subarray(slices); } + +// various operations involving fmav objects of the same shape -- experimental + +DUCC0_NOINLINE tuple, size_t, size_t> + multiprep(const vector &info, const vector &tsizes); +DUCC0_NOINLINE tuple> + multiprep(const vector &info); + +template constexpr inline size_t tuplelike_size() + { return tuple_size_v>; } + +template +inline void call_with_tuple_impl(Func &&func, const Ttuple& tuple, + index_sequence) + { func(std::forward::type>(get(tuple))...); } +template inline void call_with_tuple + (Func &&func, Ttuple &&tuple) + { + call_with_tuple_impl(std::forward(func), tuple, + make_index_sequence()>()); + } +template +inline void call_with_tuple2_impl(Func &&func, const Ttuple& tuple, + index_sequence) + { func(get(tuple)...); } +template inline void call_with_tuple2 + (Func &&func, Ttuple &&tuple) + { + call_with_tuple2_impl(std::forward(func), tuple, + make_index_sequence()>()); + } + +template +inline auto tuple_transform_impl(tuple const& inputs, Func &&func, + index_sequence) + { return tuple...>{func(get(inputs))...}; } +template +inline auto tuple_transform(tuple const& inputs, Func &&func) + { + return tuple_transform_impl(inputs, std::forward(func), + make_index_sequence{}); + } +template +inline void tuple_for_each_impl(tuple &tpl, Func &&func, + index_sequence) + { (func(get(tpl)), ...); } +template +inline void tuple_for_each(tuple &tpl, Func &&func) + { + tuple_for_each_impl(tpl, std::forward(func), make_index_sequence{}); + } +template +inline void tuple_for_each_impl(const tuple &tpl, Func &&func, + index_sequence) + { (func(get(tpl)), ...); } +template +inline void tuple_for_each(const tuple &tpl, Func &&func) + { + tuple_for_each_impl(tpl, std::forward(func), make_index_sequence{}); + } + +template +inline auto tuple_transform_idx_impl(const tuple &inputs, + Func &&func, index_sequence) + { + return tuple...> + {func(get(inputs), Is)...}; + } + +template +inline auto tuple_transform_idx(const tuple &inputs, Func &&func) + { + return tuple_transform_idx_impl(inputs, std::forward(func), + make_index_sequence{}); + } +template +inline void tuple_for_each_idx_impl(tuple &tpl, Func 
&&func, + index_sequence) + { (func(get(tpl), Is), ...); } +template +inline void tuple_for_each_idx(tuple &tpl, Func &&func) + { + tuple_for_each_idx_impl(tpl, std::forward(func), make_index_sequence{}); + } + +template inline auto to_ref (const Ttuple &tuple) + { + return tuple_transform(tuple,[](auto &&ptr) -> typename std::add_lvalue_reference_t{ return *ptr; }); + } + +template inline Ttuple update_pointers (const Ttuple &ptrs, + const vector> &str, size_t idim, size_t i) + { + return tuple_transform_idx(ptrs, [i,idim,&str](auto &&ptr, size_t idx) + { return ptr + i*str[idx][idim]; }); + } + +template inline Ttuple update_pointers_contiguous (const Ttuple &ptrs, + size_t i) + { + return tuple_transform(ptrs, [i](auto &&ptr) { return ptr+i; }); + } +template inline void advance_contiguous (Ttuple &ptrs) + { tuple_for_each(ptrs, [](auto &&ptr) { ++ptr; }); } +template inline void advance (Ttuple &ptrs, + const vector> &str, size_t idim) + { + tuple_for_each_idx(ptrs, [idim,&str](auto &&ptr, size_t idx) + { ptr += str[idx][idim]; }); + } +template inline void advance_by_n (Ttuple &ptrs, + const vector> &str, size_t idim, size_t n) + { + tuple_for_each_idx(ptrs, [idim,n,&str](auto &&ptr, size_t idx) + { ptr += n*str[idx][idim]; }); + } + +template + DUCC0_NOINLINE void applyHelper_block(size_t idim, const vector &shp, + const vector> &str, size_t bsi, size_t bsj, + const Ttuple &ptrs, Func &&func) + { + auto leni=shp[idim], lenj=shp[idim+1]; + size_t nbi = (leni+bsi-1)/bsi; + size_t nbj = (lenj+bsj-1)/bsj; + for (size_t bi=0; bi + DUCC0_NOINLINE void applyHelper(size_t idim, const vector &shp, + const vector> &str, size_t block0, size_t block1, + const Ttuple &ptrs, Func &&func, bool last_contiguous) + { + auto len = shp[idim]; + if ((idim+2==shp.size()) && (block0!=0)) // we should do blocking + applyHelper_block(idim, shp, str, block0, block1, ptrs, func); + else if (idim+1 + inline void applyHelper(const vector &shp, + const vector> &str, size_t block0, size_t block1, + const Ttuple &ptrs, Func &&func, size_t nthreads, bool last_contiguous) + { + if (shp.size()==0) + call_with_tuple(std::forward(func), to_ref(ptrs)); + else if (nthreads==1) + applyHelper(0, shp, str, block0, block1, ptrs, std::forward(func), last_contiguous); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + applyHelper(0, locshp, str, block0, block1, locptrs, func, last_contiguous); + }); + } + +template + void mav_apply(Func &&func, int nthreads, Targs... 
args) + { + vector infos; + (infos.push_back(args), ...); + vector tsizes; + (tsizes.push_back(sizeof(args.data()[0])), ...); + auto [shp, str, block0, block1] = multiprep(infos, tsizes); + bool last_contiguous = true; + if (shp.size()>0) + for (const auto &s:str) + last_contiguous &= (s.back()==1); + + auto ptrs = tuple_transform(forward_as_tuple(args...), + [](auto &&arg){return arg.data();}); + applyHelper(shp, str, block0, block1, ptrs, std::forward(func), nthreads, last_contiguous); + } + +DUCC0_NOINLINE tuple> + multiprep_noopt(const vector &info); + +template +inline void call_with_tuple_arg_impl(Func &&func, Arg &&arg, const Ttuple& tuple, + index_sequence) + { func(std::forward::type>(get(tuple))..., arg); } +template inline void call_with_tuple_arg + (Func &&func, Arg &&arg, Ttuple &&tuple) + { + call_with_tuple_arg_impl(std::forward(func), arg, tuple, + make_index_sequence()>()); + } +template + DUCC0_NOINLINE void applyHelper_with_index(size_t idim, const vector &shp, + const vector> &str, const Ttuple &ptrs, Func &&func, + vector &index) + { + auto len = shp[idim]; + if (idim+1 &>(index), to_ref(locptrs)); + index[idim] = idxbak; + } + } +template + inline void applyHelper_with_index(const vector &shp, + const vector> &str, const Ttuple &ptrs, Func &&func, + size_t nthreads, vector &index) + { + if (shp.size()==0) + call_with_tuple_arg(std::forward(func), const_cast &>(index), to_ref(ptrs)); + else if (nthreads==1) + applyHelper_with_index(0, shp, str, ptrs, std::forward(func), index); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + auto locidx(index); + locidx[0]=lo; + applyHelper_with_index(0, locshp, str, locptrs, func, locidx); + }); + } +template + void mav_apply_with_index(Func &&func, int nthreads, Targs... args) + { + vector infos; + (infos.push_back(args), ...); + auto [shp, str] = multiprep_noopt(infos); + vector index(shp.size(), 0); + + auto ptrs = tuple_transform(forward_as_tuple(args...), + [](auto &&arg){return arg.data();}); + applyHelper_with_index(shp, str, ptrs, std::forward(func), nthreads, index); + } + + +template class mavref + { + private: + const mav_info &info; + T *d; + + public: + using shape_t = typename mav_info::shape_t; + using stride_t = typename mav_info::stride_t; + mavref(const mav_info &info_, T *d_) : info(info_), d(d_) {} + template T &operator()(Ns... ns) const + { return d[info.idx(ns...)]; } + /// Returns the total number of entries in the object. + size_t size() const { return info.size(); } + /// Returns the shape of the object. + const shape_t &shape() const { return info.shape(); } + /// Returns the length along dimension \a i. + size_t shape(size_t i) const { return info.shape(i); } + /// Returns the strides of the object. + const stride_t &stride() const { return info.stride(); } + /// Returns the stride along dimension \a i. + const ptrdiff_t &stride(size_t i) const { return info.stride(i); } + /// Returns true iff the last dimension has stride 1. + /** Typically used for optimization purposes. */ + bool last_contiguous() const + { return info.last_contiguous(); } + /** Returns true iff the object is C-contiguous, i.e. if the stride of the + * last dimension is 1, the stride for the next-to-last dimension is the + * shape of the last dimension etc. */ + bool contiguous() const + { return info.contiguous(); } + /// Returns true iff this->shape and \a other.shape match. 
+ bool conformable(const mavref &other) const + { return shape()==other.shape(); } + }; + +template + mavref make_mavref(const mav_info &info_, T *d_) + { return mavref(info_, d_); } + +template +inline auto tuple_transform2_impl(const tuple &i1, const tuple &i2, + Func &&func, index_sequence) + { return tuple...>{func(get(i1),get(i2))...}; } +template +inline auto tuple_transform2(const tuple &i1, const tuple &i2, + Func &&func) + { + return tuple_transform2_impl(i1, i2, std::forward(func), + make_index_sequence{}); + } +template + auto make_mavrefs(const Tptrs &ptrs, const Tinfos &infos) + { + return tuple_transform2(ptrs, infos, [](auto &&ptr, auto &&info) + { return make_mavref(info, ptr); }); + } + +template auto make_infos(const fmav_info &info) + { + if constexpr(ndim>0) + MR_assert(ndim<=info.ndim(), "bad dimensionality"); + auto iterdim = info.ndim()-ndim; + fmav_info fout({info.shape().begin(),info.shape().begin()+iterdim}, + {info.stride().begin(),info.stride().begin()+iterdim}); + + typename mav_info::shape_t shp; + typename mav_info::stride_t str; + if constexpr (ndim>0) // just to silence compiler warnings + for (size_t i=0; i iout(shp, str); + return make_tuple(fout, iout); + } + +template + DUCC0_NOINLINE void flexible_mav_applyHelper(size_t idim, const vector &shp, + const vector> &str, const Tptrs &ptrs, + const Tinfos &infos, Func &&func) + { + auto len = shp[idim]; + auto locptrs(ptrs); + if (idim+1 + DUCC0_NOINLINE void flexible_mav_applyHelper(const vector &shp, + const vector> &str, const Tptrs &ptrs, + const Tinfos &infos, Func &&func, size_t nthreads) + { + if (shp.size()==0) + call_with_tuple2(func, make_mavrefs(ptrs, infos)); + else if (nthreads==1) + flexible_mav_applyHelper(0, shp, str, ptrs, infos, std::forward(func)); + else + execParallel(shp[0], nthreads, [&](size_t lo, size_t hi) + { + auto locptrs = update_pointers(ptrs, str, 0, lo); + auto locshp(shp); + locshp[0] = hi-lo; + flexible_mav_applyHelper(0, locshp, str, locptrs, infos, func); + }); + } + +template struct Xdim { static constexpr size_t dim=ndim; }; + +template + void xflexible_mav_apply(const Ttuple &tuple, const Tdim &dim, Func &&func, size_t nthreads) + { + auto fullinfos = tuple_transform2(tuple, dim, [](const auto &arg, const auto &dim) + { return make_infos::dim>(fmav_info(arg)); }); + vector iter_infos; + tuple_for_each(fullinfos,[&iter_infos](const auto &entry){iter_infos.push_back(get<0>(entry));}); + auto [shp, str] = multiprep(iter_infos); + + auto infos2 = tuple_transform(fullinfos, [](const auto &arg) + { return get<1>(arg); }); + auto ptrs = tuple_transform(tuple, [](auto &&arg){return arg.data();}); + flexible_mav_applyHelper(shp, str, ptrs, infos2, std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0) + { + xflexible_mav_apply(forward_as_tuple(m0), + forward_as_tuple(Xdim()), + std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0, T1 &&m1) + { + xflexible_mav_apply(forward_as_tuple(m0, m1), + forward_as_tuple(Xdim(), Xdim()), + std::forward(func), nthreads); + } + +template + void flexible_mav_apply(Func &&func, size_t nthreads, T0 &&m0, T1 &&m1, T2 &&m2) + { + xflexible_mav_apply(forward_as_tuple(m0, m1, m2), + forward_as_tuple(Xdim(), Xdim(), Xdim()), + std::forward(func), nthreads); + } + +} + +using detail_mav::UNINITIALIZED; +using detail_mav::fmav_info; +using detail_mav::mav_info; +using detail_mav::slice; +using detail_mav::MAXIDX; +using detail_mav::cfmav; 
+using detail_mav::vfmav; +using detail_mav::cmav; +using detail_mav::vmav; +using detail_mav::subarray; +using detail_mav::mav_apply; +using detail_mav::mav_apply_with_index; +using detail_mav::flexible_mav_apply; +} + +#endif diff --git a/contrib/ducc0/infra/misc_utils.h b/contrib/ducc0/infra/misc_utils.h new file mode 100644 index 000000000..77d3019fa --- /dev/null +++ b/contrib/ducc0/infra/misc_utils.h @@ -0,0 +1,127 @@ +/* Copyright (C) 2019-2021 Max-Planck-Society + Author: Martin Reinecke */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_MISC_UTILS_H +#define DUCC0_MISC_UTILS_H + +#include +#include +#ifdef __GLIBC__ +#include +#include +#include +#endif + +namespace ducc0 { + +namespace detail_misc_utils { + +using namespace std; + +template auto calcShare(size_t nshares, size_t myshare, + const T &begin, const T &end) + { + auto nwork = end-begin; + auto nbase = nwork/nshares; + auto additional = nwork%nshares; + auto lo = begin + (myshare*nbase + ((myshare auto calcShare(size_t nshares, size_t myshare, const T &end) + { return calcShare(nshares, myshare, T(0), end); } + +template shp noncritical_shape(const shp &in, size_t elemsz) + { + constexpr size_t critstride = 4096; // must be a power of 2 + auto ndim = in.size(); + shp res(in); + size_t stride = elemsz; + for (size_t i=0, xi=ndim-1; i+1) +#include +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_simd { + +namespace stdx=std::experimental; +using stdx::native_simd; + +template struct simd_select + { using type = stdx::simd>; }; + +using stdx::element_aligned_tag; +template constexpr inline bool vectorizable = native_simd::size()>1; + +template constexpr bool simd_exists_h() + { + if constexpr (N>1) + if constexpr (vectorizable) + if constexpr (!std::is_same_v>, stdx::fixed_size_simd>) + return true; + return false; + } +template constexpr inline bool simd_exists = simd_exists_h(); + +template inline stdx::simd apply(stdx::simd in, Func func) + { + stdx::simd res; + for (size_t i=0; i inline stdx::simd sin(stdx::simd in) + { return apply(in,[](T v){return sin(v);}); } +template inline stdx::simd cos(stdx::simd in) + { return apply(in,[](T v){return cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using detail_simd::simd_exists; +using detail_simd::vectorizable; + +} + +#else + +// only enable SIMD support for gcc>=5.0 and clang>=5.0 +#ifndef DUCC0_NO_SIMD +#define DUCC0_NO_SIMD +#if defined(__clang__) +// AppleClang has their own version numbering +#ifdef __apple_build_version__ +# if (__clang_major__ > 9) || (__clang_major__ == 9 && __clang_minor__ >= 1) +# undef DUCC0_NO_SIMD +# endif +#elif __clang_major__ >= 5 +# undef DUCC0_NO_SIMD +#endif +#elif defined(__GNUC__) +#if __GNUC__>=5 +#undef DUCC0_NO_SIMD +#endif +#endif +#endif + +#include +#include +#include + +#ifndef DUCC0_NO_SIMD +#if defined(__SSE2__) // we are on an x86 platform and we have vector types +#include +#endif + +#if defined(__aarch64__) // let's check for SVE and Neon +#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) +#if __ARM_FEATURE_SVE_BITS>0 +// OK, we can use SVE +#define DUCC0_USE_SVE +#include +#endif +#endif +#ifndef DUCC0_USE_SVE +// see if we can use Neon +#if defined(__ARM_NEON) +#define DUCC0_USE_NEON +#include +#endif +#endif +#endif + +#endif + +namespace ducc0 { + +namespace detail_simd { + +/// true iff SIMD support is provided for \a T. +template constexpr inline bool vectorizable = false; +#if (!defined(DUCC0_NO_SIMD)) +#if defined(__SSE2__) || defined (DUCC0_USE_SVE) || defined (DUCC0_USE_NEON) +template<> constexpr inline bool vectorizable = true; +template<> constexpr inline bool vectorizable = true; +#endif +#endif + +/// true iff a SIMD type with vector length \a len exists for \a T. 
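+/// (The primary template below defaults to false; architecture-specific
+/// specializations further down are expected to flip it to true for the
+/// vector lengths the target ISA actually provides.)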
+template constexpr inline bool simd_exists = false; + +template constexpr size_t vectorlen + = vectorizable ? reglen/sizeof(T) : 1; + +template class helper_; +template struct vmask_ + { + private: + using hlp = helper_; + using Tm = typename hlp::Tm; + Tm v; + + public: +#if defined(_MSC_VER) + vmask_() {} + vmask_(const vmask_ &other) : v(other.v) {} + vmask_ &operator=(const vmask_ &other) + { v = other.v; return *this; } +#else + vmask_() = default; + vmask_(const vmask_ &other) = default; + vmask_ &operator=(const vmask_ &other) = default; +#endif + vmask_(Tm v_): v(v_) {} + operator Tm() const { return v; } + bool none() const { return hlp::mask_none(v); } + bool any() const { return hlp::mask_any(v); } + bool all() const { return hlp::mask_all(v); } + vmask_ operator& (const vmask_ &other) const { return hlp::mask_and(v,other.v); } + vmask_ &operator&= (const vmask_ &other) { v=hlp::mask_and(v,other.v); return *this; } + vmask_ operator| (const vmask_ &other) const { return hlp::mask_or(v,other.v); } + vmask_ &operator|= (const vmask_ &other) { v=hlp::mask_or(v,other.v); return *this; } + }; +struct element_aligned_tag {}; +template class vtp + { + private: + using hlp = helper_; + + public: + using value_type = T; + using Tv = typename hlp::Tv; + using Tm = vmask_; + static constexpr size_t size() { return len; } + + private: + Tv v; + + public: +#if defined(_MSC_VER) + vtp() {} + vtp(const vtp &other): v(other.v) {} + vtp &operator=(const vtp &other) + { v=other.v; return *this; } +#else + vtp() = default; + vtp(const vtp &other) = default; + vtp &operator=(const vtp &other) = default; +#endif + vtp(T other): vtp(hlp::from_scalar(other)) {} + vtp(const Tv &other) : v(other) {} + vtp &operator=(const T &other) { v=hlp::from_scalar(other); return *this; } + operator Tv() const { return v; } + + vtp(const T *ptr, element_aligned_tag) : v(hlp::loadu(ptr)) {} + void copy_to(T *ptr, element_aligned_tag) const { hlp::storeu(ptr, v); } + + vtp operator-() const { return vtp(-v); } + vtp operator+(vtp other) const { return vtp(v+other.v); } + vtp operator-(vtp other) const { return vtp(v-other.v); } + vtp operator*(vtp other) const { return vtp(v*other.v); } + vtp operator/(vtp other) const { return vtp(v/other.v); } + vtp &operator+=(vtp other) { v+=other.v; return *this; } + vtp &operator-=(vtp other) { v-=other.v; return *this; } + vtp &operator*=(vtp other) { v*=other.v; return *this; } + vtp &operator/=(vtp other) { v/=other.v; return *this; } + vtp abs() const { return hlp::abs(v); } + inline vtp sqrt() const + { return hlp::sqrt(v); } + vtp max(const vtp &other) const + { return hlp::max(v, other.v); } + vtp min(const vtp &other) const + { return hlp::min(v, other.v); } + Tm operator>(const vtp &other) const + { return hlp::gt(v, other.v); } + Tm operator>=(const vtp &other) const + { return hlp::ge(v, other.v); } + Tm operator<(const vtp &other) const + { return hlp::lt(v, other.v); } + Tm operator<=(const vtp &other) const + { return hlp::le(v, other.v); } + Tm operator==(const vtp &other) const + { return hlp::eq(v, other.v); } + Tm operator!=(const vtp &other) const + { return hlp::ne(v, other.v); } + static vtp blend(Tm mask, const vtp &a, const vtp &b) + { return hlp::blend(mask, a, b); } + + class reference + { + private: + vtp &v; + size_t i; + public: + reference (vtp &v_, size_t i_) + : v(v_), i(i_) {} + reference &operator= (T other) + { v.v[i] = other; return *this; } + reference &operator*= (T other) + { v.v[i] *= other; return *this; } + operator T() const { return 
v.v[i]; } + }; + + void Set(size_t i, T val) { v[i] = val; } + reference operator[](size_t i) { return reference(*this, i); } + T operator[](size_t i) const { return v[i]; } + + class where_expr + { + private: + vtp &v; + Tm m; + + public: + where_expr (Tm m_, vtp &v_) + : v(v_), m(m_) {} + where_expr &operator= (const vtp &other) + { v=hlp::blend(m, other.v, v.v); return *this; } + where_expr &operator*= (const vtp &other) + { v=hlp::blend(m, v.v*other.v, v.v); return *this; } + where_expr &operator+= (const vtp &other) + { v=hlp::blend(m, v.v+other.v, v.v); return *this; } + where_expr &operator-= (const vtp &other) + { v=hlp::blend(m, v.v-other.v, v.v); return *this; } + }; + }; +template inline vtp abs(vtp v) { return v.abs(); } +template typename vtp::where_expr where(typename vtp::Tm m, vtp &v) + { return typename vtp::where_expr(m, v); } +template vtp operator*(T0 a, vtp b) + { return b*a; } +template vtp operator+(T a, vtp b) + { return b+a; } +template vtp operator-(T a, vtp b) + { return vtp(a) - b; } +template vtp max(vtp a, vtp b) + { return a.max(b); } +template vtp min(vtp a, vtp b) + { return a.min(b); } +template vtp sqrt(vtp v) + { return v.sqrt(); } +template inline bool none_of(const vmask_ &mask) + { return mask.none(); } +template inline bool any_of(const vmask_ &mask) + { return mask.any(); } +template inline bool all_of(const vmask_ &mask) + { return mask.all(); } +template inline vtp blend (const vmask_ &mask, const vtp &a, const vtp &b) + { return vtp::blend(mask, a, b); } +template T reduce(const vtp &v, Op op) + { + T res=v[0]; + for (size_t i=1; i vtp apply(vtp in, Func func) + { + vtp res; + for (size_t i=0; i class pseudoscalar + { + private: + T v; + + public: +#if defined(_MSC_VER) + pseudoscalar() {} + pseudoscalar(const pseudoscalar &other) : v(other.v) {} + pseudoscalar & operator=(const pseudoscalar &other) + { v=other.v; return *this; } +#else + pseudoscalar() = default; + pseudoscalar(const pseudoscalar &other) = default; + pseudoscalar & operator=(const pseudoscalar &other) = default; +#endif + pseudoscalar(T v_):v(v_) {} + pseudoscalar operator-() const { return pseudoscalar(-v); } + pseudoscalar operator+(pseudoscalar other) const { return pseudoscalar(v+other.v); } + pseudoscalar operator-(pseudoscalar other) const { return pseudoscalar(v-other.v); } + pseudoscalar operator*(pseudoscalar other) const { return pseudoscalar(v*other.v); } + pseudoscalar operator/(pseudoscalar other) const { return pseudoscalar(v/other.v); } + pseudoscalar &operator+=(pseudoscalar other) { v+=other.v; return *this; } + pseudoscalar &operator-=(pseudoscalar other) { v-=other.v; return *this; } + pseudoscalar &operator*=(pseudoscalar other) { v*=other.v; return *this; } + pseudoscalar &operator/=(pseudoscalar other) { v/=other.v; return *this; } + + pseudoscalar abs() const { return std::abs(v); } + inline pseudoscalar sqrt() const { return std::sqrt(v); } + pseudoscalar max(const pseudoscalar &other) const + { return std::max(v, other.v); } + pseudoscalar min(const pseudoscalar &other) const + { return std::min(v, other.v); } + + bool operator>(const pseudoscalar &other) const + { return v>other.v; } + bool operator>=(const pseudoscalar &other) const + { return v>=other.v; } + bool operator<(const pseudoscalar &other) const + { return v class helper_ + { + private: + static constexpr size_t len = 1; + public: + using Tv = pseudoscalar; + using Tm = bool; + + static Tv loadu(const T *ptr) { return *ptr; } + static void storeu(T *ptr, Tv v) { *ptr = v[0]; } + + static Tv 
from_scalar(T v) { return v; } + static Tv abs(Tv v) { return v.abs(); } + static Tv max(Tv v1, Tv v2) { return v1.max(v2); } + static Tv min(Tv v1, Tv v2) { return v1.min(v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return m ? v1 : v2; } + static Tv sqrt(Tv v) { return v.sqrt(); } + static Tm gt (Tv v1, Tv v2) { return v1>v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 8; + public: + using Tv = __m512d; + using Tm = __mmask8; + + static Tv loadu(const T *ptr) { return _mm512_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_pd(v); } + static Tv abs(Tv v) { return __m512d(_mm512_andnot_epi64(__m512i(_mm512_set1_pd(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_pd(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm512_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_pd(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_pd_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 16; + public: + using Tv = __m512; + using Tm = __mmask16; + + static Tv loadu(const T *ptr) { return _mm512_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm512_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm512_set1_ps(v); } + static Tv abs(Tv v) { return __m512(_mm512_andnot_epi32(__m512i(_mm512_set1_ps(-0.)),__m512i(v))); } + static Tv max(Tv v1, Tv v2) { return _mm512_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm512_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm512_mask_blend_ps(m, v2, v1); } + static Tv sqrt(Tv v) { return _mm512_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm512_cmp_ps_mask(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return v1&v2; } + static Tm mask_or (Tm v1, Tm v2) { return v1|v2; } + static bool mask_none(Tm v) { return v==0; } + static bool mask_any(Tm v) { return v!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = Tm((size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + 
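+    // Editor's note: AVX specialization; one __m256d packs len = 4 doubles,
+    // and AVX comparisons return all-ones/all-zero lanes, so the mask type
+    // reuses the vector type (Tm == Tv == __m256d below).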
static constexpr size_t len = 4; + public: + using Tv = __m256d; + using Tm = __m256d; + + static Tv loadu(const T *ptr) { return _mm256_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_pd(v); } + static Tv abs(Tv v) { return _mm256_andnot_pd(_mm256_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_pd(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm256_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_pd(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_pd(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 8; + public: + using Tv = __m256; + using Tm = __m256; + + static Tv loadu(const T *ptr) { return _mm256_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm256_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm256_set1_ps(v); } + static Tv abs(Tv v) { return _mm256_andnot_ps(_mm256_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm256_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm256_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return _mm256_blendv_ps(v2, v1, m); } + static Tv sqrt(Tv v) { return _mm256_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GT_OQ); } + static Tm ge (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_GE_OQ); } + static Tm lt (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LT_OQ); } + static Tm le (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_LE_OQ); } + static Tm eq (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_EQ_OQ); } + static Tm ne (Tv v1, Tv v2) { return _mm256_cmp_ps(v1,v2,_CMP_NEQ_OQ); } + static Tm mask_and (Tm v1, Tm v2) { return _mm256_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm256_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm256_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = __m128d; + using Tm = __m128d; + + static Tv loadu(const T *ptr) { return _mm_loadu_pd(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_pd(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_pd(v); } + static Tv abs(Tv v) { return _mm_andnot_pd(_mm_set1_pd(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_pd(v1, v2); } + static Tv min(Tv v1, Tv 
v2) { return _mm_min_pd(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_pd(v2,v1,m); +#else + return _mm_or_pd(_mm_and_pd(m,v1),_mm_andnot_pd(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_pd(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_pd(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_pd(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_pd(v1,v2); } + static Tm le (Tv v1, Tv v2) { return _mm_cmple_pd(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_pd(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_pd(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_pd(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_pd(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_pd(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = __m128; + using Tm = __m128; + + static Tv loadu(const T *ptr) { return _mm_loadu_ps(ptr); } + static void storeu(T *ptr, Tv v) { _mm_storeu_ps(ptr, v); } + + static Tv from_scalar(T v) { return _mm_set1_ps(v); } + static Tv abs(Tv v) { return _mm_andnot_ps(_mm_set1_ps(-0.),v); } + static Tv max(Tv v1, Tv v2) { return _mm_max_ps(v1, v2); } + static Tv min(Tv v1, Tv v2) { return _mm_min_ps(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { +#if defined(__SSE4_1__) + return _mm_blendv_ps(v2,v1,m); +#else + return _mm_or_ps(_mm_and_ps(m,v1),_mm_andnot_ps(m,v2)); +#endif + } + static Tv sqrt(Tv v) { return _mm_sqrt_ps(v); } + static Tm gt (Tv v1, Tv v2) { return _mm_cmpgt_ps(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return _mm_cmpge_ps(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return _mm_cmplt_ps(v1,v2); } + static Tm le (Tv v1, Tv v2) { return _mm_cmple_ps(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return _mm_cmpeq_ps(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return _mm_cmpneq_ps(v1,v2); } + static Tm mask_and (Tm v1, Tm v2) { return _mm_and_ps(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return _mm_or_ps(v1,v2); } + static size_t maskbits(Tm v) { return size_t(_mm_movemask_ps(v)); } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< class gnuvec_helper + { + public: + using Tv __attribute__ ((vector_size (len*sizeof(T)))) = T; + using Tm = decltype(Tv()v2; } + static Tm ge (Tv v1, Tv v2) { return v1>=v2; } + static Tm lt (Tv v1, Tv v2) { return v1 constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +template<> constexpr inline bool simd_exists = true; +template<> class helper_: public gnuvec_helper {}; +#endif + +#if defined(DUCC0_USE_NEON) +template<> constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = double; + static constexpr size_t len = 2; + public: + using Tv = float64x2_t; + using Tm = uint64x2_t; + + static Tv loadu(const T *ptr) { return vld1q_f64(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f64(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f64(v); } + static Tv abs(Tv v) { return vabsq_f64(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f64(v1, v2); } + static Tv 
min(Tv v1, Tv v2) { return vminq_f64(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) + { return vbslq_f64(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f64(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f64(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f64(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return vcltq_f64(v1,v2); } + static Tm le (Tv v1, Tv v2) { return vcleq_f64(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return vceqq_f64(v1,v2); } + static Tm ne (Tv v1, Tv v2) + { return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(v1,v2)))); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u64(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u64(v1,v2); } + static size_t maskbits(Tm v) + { + auto high_bits = vshrq_n_u64(v, 63); + return vgetq_lane_u64(high_bits, 0) | ((vgetq_lane_u64(high_bits, 1)<<1)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< constexpr inline bool simd_exists = true; +template<> class helper_ + { + private: + using T = float; + static constexpr size_t len = 4; + public: + using Tv = float32x4_t; + using Tm = uint32x4_t; + + static Tv loadu(const T *ptr) { return vld1q_f32(ptr); } + static void storeu(T *ptr, Tv v) { vst1q_f32(ptr, v); } + + static Tv from_scalar(T v) { return vdupq_n_f32(v); } + static Tv abs(Tv v) { return vabsq_f32(v); } + static Tv max(Tv v1, Tv v2) { return vmaxq_f32(v1, v2); } + static Tv min(Tv v1, Tv v2) { return vminq_f32(v1, v2); } + static Tv blend(Tm m, Tv v1, Tv v2) { return vbslq_f32(m, v1, v2); } + static Tv sqrt(Tv v) { return vsqrtq_f32(v); } + static Tm gt (Tv v1, Tv v2) { return vcgtq_f32(v1,v2); } + static Tm ge (Tv v1, Tv v2) { return vcgeq_f32(v1,v2); } + static Tm lt (Tv v1, Tv v2) { return vcltq_f32(v1,v2); } + static Tm le (Tv v1, Tv v2) { return vcleq_f32(v1,v2); } + static Tm eq (Tv v1, Tv v2) { return vceqq_f32(v1,v2); } + static Tm ne (Tv v1, Tv v2) { return vmvnq_u32(vceqq_f32(v1,v2)); } + static Tm mask_and (Tm v1, Tm v2) { return vandq_u32(v1,v2); } + static Tm mask_or (Tm v1, Tm v2) { return vorrq_u32(v1,v2); } + static size_t maskbits(Tm v) + { + static constexpr int32x4_t shift = {0, 1, 2, 3}; + auto tmp = vshrq_n_u32(v, 31); + return vaddvq_u32(vshlq_u32(tmp, shift)); + } + static bool mask_none(Tm v) { return maskbits(v)==0; } + static bool mask_any(Tm v) { return maskbits(v)!=0; } + static bool mask_all(Tm v) + { + static constexpr auto fullmask = (size_t(1)< using native_simd = vtp>; +#elif defined(__AVX__) +template using native_simd = vtp>; +#elif defined(__SSE2__) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_SVE) +template using native_simd = vtp>; +#elif defined(DUCC0_USE_NEON) +template using native_simd = vtp>; +#else +template using native_simd = vtp; +#endif + +#else // DUCC0_NO_SIMD is defined +/// The SIMD type for \a T with the largest vector length on this platform. +template using native_simd = vtp; +#endif +/// Provides a SIMD type for \a T with vector length \a len, if it exists. 
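+// Editor's note on the native_simd selection above: with the register widths
+// implied there, native_simd<double> has 8 lanes under AVX-512, 4 under AVX,
+// 2 under SSE2/NEON, and degenerates to the scalar pseudoscalar wrapper
+// otherwise. A hedged usage sketch, using only names defined in this header:
+//
+//   ducc0::native_simd<float> v(1.5f);   // broadcast a scalar to all lanes
+//   v = v*v + 0.25f;                     // lane-wise arithmetic
+//   float lane0 = v[0];                  // per-lane read access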
+template struct simd_select + { using type = vtp; }; +template inline vtp sin(vtp in) + { return apply(in,[](T v){return std::sin(v);}); } +template inline vtp cos(vtp in) + { return apply(in,[](T v){return std::cos(v);}); } + +} + +using detail_simd::element_aligned_tag; +using detail_simd::native_simd; +using detail_simd::simd_select; +using detail_simd::simd_exists; +using detail_simd::vectorizable; + +} +#endif +#endif diff --git a/contrib/ducc0/infra/string_utils.cc b/contrib/ducc0/infra/string_utils.cc new file mode 100644 index 000000000..652d91b3b --- /dev/null +++ b/contrib/ducc0/infra/string_utils.cc @@ -0,0 +1,223 @@ +/* + * This file is part of libcxxsupport. + * + * libcxxsupport is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * libcxxsupport is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with libcxxsupport; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* + * libcxxsupport is being developed at the Max-Planck-Institut fuer Astrophysik + * and financially supported by the Deutsches Zentrum fuer Luft- und Raumfahrt + * (DLR). + */ + +/* + * This file contains the implementation of various convenience functions + * used by the Planck LevelS package. + * + * Copyright (C) 2002-2021 Max-Planck-Society + * Author: Martin Reinecke + */ + +#include +#include +#include +#include +#include +#include +#include +#include "ducc0/infra/string_utils.h" +#include "ducc0/infra/error_handling.h" + +namespace ducc0 { + +namespace detail_string_utils { + +using namespace std; + +string trim (const string &orig) + { + string::size_type p1=orig.find_first_not_of(" \t"); + if (p1==string::npos) return ""; + string::size_type p2=orig.find_last_not_of(" \t"); + return orig.substr(p1,p2-p1+1); + } + +template string dataToString (const T &x) + { + ostringstream strstrm; + strstrm << x; + return trim(strstrm.str()); + } + +template<> string dataToString (const bool &x) + { return x ? 
"T" : "F"; } +template<> string dataToString (const string &x) + { return trim(x); } +template<> string dataToString (const float &x) + { + ostringstream strstrm; + strstrm << setprecision(8) << x; + return trim(strstrm.str()); + } +template<> string dataToString (const double &x) + { + ostringstream strstrm; + strstrm << setprecision(16) << x; + return trim(strstrm.str()); + } +template<> string dataToString (const long double &x) + { + ostringstream strstrm; + strstrm << setprecision(25) << x; + return trim(strstrm.str()); + } + +template string dataToString (const signed char &x); +template string dataToString (const unsigned char &x); +template string dataToString (const short &x); +template string dataToString (const unsigned short &x); +template string dataToString (const int &x); +template string dataToString (const unsigned int &x); +template string dataToString (const long &x); +template string dataToString (const unsigned long &x); +template string dataToString (const long long &x); +template string dataToString (const unsigned long long &x); + +string intToString(int64_t x, size_t width) + { + ostringstream strstrm; + (x>=0) ? strstrm << setw(width) << setfill('0') << x + : strstrm << "-" << setw(width-1) << setfill('0') << -x; + string res = strstrm.str(); + MR_assert(res.size()==width,"number too large"); + return trim(res); + } + +template T stringToData (const string &x) + { + istringstream strstrm(x); + T value; + strstrm >> value; + bool ok = bool(strstrm); + if (ok) + { + string rest; + strstrm >> rest; + ok = rest.length()==0; + } + MR_assert(ok, "could not convert '", x, "' to desired data type."); + return value; + } + +template<> string stringToData (const string &x) + { return trim(x); } + +template<> bool stringToData (const string &x) + { + const char *fval[] = {"f","n","false",".false."}; + const char *tval[] = {"t","y","true",".true."}; + for (size_t i=0; i< sizeof(fval)/sizeof(fval[0]); ++i) + if (equal_nocase(x,fval[i])) return false; + for (size_t i=0; i< sizeof(tval)/sizeof(tval[0]); ++i) + if (equal_nocase(x,tval[i])) return true; + MR_fail("conversion error in stringToData(",x,")"); + } + +template signed char stringToData (const string &x); +template unsigned char stringToData (const string &x); +template short stringToData (const string &x); +template unsigned short stringToData (const string &x); +template int stringToData (const string &x); +template unsigned int stringToData (const string &x); +template long stringToData (const string &x); +template unsigned long stringToData (const string &x); +template long long stringToData (const string &x); +template unsigned long long stringToData (const string &x); +template float stringToData (const string &x); +template double stringToData (const string &x); +template long double stringToData (const string &x); + +bool equal_nocase (const string &a, const string &b) + { + if (a.size()!=b.size()) return false; + for (size_t m=0; m vector split (istream &stream) + { + vector list; + while (stream) + { + string word; + stream >> word; + MR_assert (stream||stream.eof(), + "error while splitting stream into components"); + if (stream) list.push_back(stringToData(word)); + } + return list; + } + +} // unnamed namespace + +template vector split (const string &inp) + { + istringstream is(inp); + return split(is); + } + +template vector split (const string &inp); +template vector split (const string &inp); +template vector split (const string &inp); +template vector split (const string &inp); +template vector split 
(const string &inp); + +vector tokenize (const string &inp, char delim) + { + istringstream stream(inp); + string token; + vector list; + while (getline(stream,token,delim)) + list.push_back(token); + return list; + } + +vector parse_words_from_file (const string &filename) + { + vector words; + ifstream inp(filename.c_str()); + MR_assert (inp,"Could not open file '", filename, "'."); + while (inp) + { + string word; + inp>>word; + word=trim(word); + if (word!="") words.push_back(word); + } + return words; + } + +}} diff --git a/contrib/ducc0/infra/string_utils.h b/contrib/ducc0/infra/string_utils.h new file mode 100644 index 000000000..e95ef4672 --- /dev/null +++ b/contrib/ducc0/infra/string_utils.h @@ -0,0 +1,99 @@ +/* + * This file is part of the MR utility library. + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/** \file ducc0/infra/string_utils.h + * + * \copyright Copyright (C) 2019-2021 Max-Planck-Society + * \author Martin Reinecke + */ + +#ifndef DUCC0_STRING_UTILS_H +#define DUCC0_STRING_UTILS_H + +// FIXME: most of this will be superseded by C++20 std::format + +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_string_utils { + +/*! \defgroup stringutilsgroup String handling helper functions */ +/*! \{ */ + +/// Returns the string \a orig without leading and trailing whitespace. +std::string trim (const std::string &orig); + +/// Returns a string containing the text representation of \a x. +/*! Care is taken that no information is lost in the conversion. */ +template std::string dataToString(const T &x); +template<> std::string dataToString (const bool &x); +template<> std::string dataToString (const std::string &x); +template<> std::string dataToString (const float &x); +template<> std::string dataToString (const double &x); +template<> std::string dataToString (const long double &x); + +/// Returns a string containing the text representation of \a x, padded +/// with leading zeroes to \a width characters. +std::string intToString(std::int64_t x, std::size_t width); + +/// Reads a value of a given datatype from a string. +template T stringToData (const std::string &x); +template<> std::string stringToData (const std::string &x); +template<> bool stringToData (const std::string &x); + +/// Case-insensitive string comparison +/*! Returns \a true, if \a a and \a b differ only in capitalisation, + else \a false. */ +bool equal_nocase (const std::string &a, const std::string &b); + +/// Returns lowercase version of \a input. +std::string tolower(const std::string &input); + +/// Tries to split \a inp into a white-space separated list of values of +/// type \a T, and appends them to \a list. +template inline std::vector split (const std::string &inp); + +/// Breaks the string \a inp into tokens separated by \a delim, and returns them +/// as a vector. 
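+// Editor's note: the container types below lost their template arguments in
+// extraction; judging from the .cc file they are presumably
+// std::vector<std::string>. Example behaviour of the getline-based tokenize:
+//   tokenize("a:b:c", ':') -> {"a", "b", "c"}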
+std::vector tokenize (const std::string &inp, char delim); + +/// Breaks the contents of file \a filename into tokens separated by white +/// space, and returns them as a vector. +std::vector parse_words_from_file (const std::string &filename); + +/*! \} */ + +} + +using detail_string_utils::trim; +//using detail_string_utils::intToString; +using detail_string_utils::dataToString; +using detail_string_utils::stringToData; +using detail_string_utils::equal_nocase; +//using detail_string_utils::tolower; +//using detail_string_utils::split; +//using detail_string_utils::tokenize; +//using detail_string_utils::parse_words_from_file; + +} + +#endif diff --git a/contrib/ducc0/infra/threading.cc b/contrib/ducc0/infra/threading.cc new file mode 100644 index 000000000..77c04b50d --- /dev/null +++ b/contrib/ducc0/infra/threading.cc @@ -0,0 +1,759 @@ +/** \file ducc0/infra/threading.cc + * + * \copyright Copyright (C) 2019-2023 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "ducc0/infra/threading.h" +#include "ducc0/infra/error_handling.h" +#include "ducc0/infra/misc_utils.h" +#include "ducc0/infra/string_utils.h" +#include +#include +#include + +#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __has_include() +#include +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) +#include +#endif +#endif +#endif + +namespace ducc0 { + +namespace detail_threading { + +class latch + { + std::atomic num_left_; + Mutex mut_; + CondVar completed_; + using lock_t = UniqueLock; + + public: + latch(size_t n): num_left_(n) {} + + void count_down() + { + lock_t lock(mut_); + if (--num_left_) + return; + completed_.notify_all(); + } + + void wait() + { + lock_t lock(mut_); + completed_.wait(lock, [this]{ return is_ready(); }); + } + bool is_ready() { return num_left_ == 0; } + }; + +#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING + +size_t ducc0_max_threads() + { + static const size_t max_threads_ = []() + { +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + pthread_getaffinity_np(pthread_self(), sizeof(cpuset), &cpuset); + size_t res=0; + for (size_t i=0; i(1, std::thread::hardware_concurrency()); +#endif + auto evar=getenv("DUCC0_NUM_THREADS"); + // fallback + if (!evar) + evar=getenv("OMP_NUM_THREADS"); + if (!evar) + return res; + auto res2 = stringToData(trim(std::string(evar))); + MR_assert(res2>=0, "invalid value in DUCC0_NUM_THREADS/OMP_NUM_THREADS"); + if (res2==0) + return res; + return std::min(res, res2); + }(); + return max_threads_; + } + +static thread_local bool in_parallel_region = false; +int pin_info() + { + static const int pin_info_ = []() + { + auto evar=getenv("DUCC0_PIN_DISTANCE"); + if (!evar) + return -1; // do nothing at all + auto res = stringToData(trim(std::string(evar))); + return int(res); + }(); + return pin_info_; + } +int pin_offset() + { + static const int pin_offset_ = []() + { + auto evar=getenv("DUCC0_PIN_OFFSET"); + if (!evar) + return 0; + auto res = stringToData(trim(std::string(evar))); + return int(res); + }(); + return pin_offset_; + } + +template class concurrent_queue + { + std::queue q_; + Mutex mut_; + std::atomic size_=0; + using lock_t = LockGuard; + + public: + void push(T val) + { + lock_t lock(mut_); + ++size_; + q_.push(std::move(val)); + } + + bool try_pop(T &val) + { + if (size_==0) return false; + lock_t lock(mut_); + // Queue might have been emptied while we acquired the lock + if (q_.empty()) return false; + + val = std::move(q_.front()); + --size_; + q_.pop(); + return true; + } + + bool empty() const { return size_==0; } + }; + +#if __has_include() && defined(__linux__) && defined(_GNU_SOURCE) +static void do_pinning(int ithread) + { + if (pin_info()==-1) return; + int num_proc = sysconf(_SC_NPROCESSORS_ONLN); + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + int cpu_wanted = pin_offset() + ithread*pin_info(); + MR_assert((cpu_wanted>=0)&&(cpu_wanted work; + + void worker_main( + std::atomic &shutdown_flag, + std::atomic &unscheduled_tasks, + concurrent_queue> &overflow_work, size_t ithread) + { + in_parallel_region = true; + do_pinning(ithread); + using lock_t = UniqueLock; + bool expect_work = true; + while (!shutdown_flag || expect_work) + { + 
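+        // Editor's summary of the worker loop: a thread first waits for an
+        // item handed directly to its own slot ('work', guarded by 'mut'),
+        // runs it, then drains the shared overflow queue; if 'busy_flag' was
+        // already set by a submitter, a direct item is incoming, so the
+        // thread loops back to wait on its slot ('expect_work') instead of
+        // draining the queue.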
std::function local_work; + if (expect_work || unscheduled_tasks == 0) + { + lock_t lock(mut); + // Wait until there is work to be executed + work_ready.wait(lock, [&]{ return (work || shutdown_flag); }); + local_work.swap(work); + expect_work = false; + } + + bool marked_busy = false; + if (local_work) + { + marked_busy = true; + local_work(); + } + + if (!overflow_work.empty()) + { + if (!marked_busy && busy_flag.test_and_set()) + { + expect_work = true; + continue; + } + marked_busy = true; + + while (overflow_work.try_pop(local_work)) + { + --unscheduled_tasks; + local_work(); + } + } + + if (marked_busy) busy_flag.clear(); + } + } + }; + + concurrent_queue> overflow_work_; + Mutex mut_; + std::vector workers_; + std::atomic shutdown_=false; + std::atomic unscheduled_tasks_=0; + using lock_t = LockGuard; + + void create_threads() + { + lock_t lock(mut_); + size_t nthreads=workers_.size(); + for (size_t i=0; ibusy_flag.clear(); + worker->work = nullptr; + worker->thread = std::thread( + [worker, this, i]{ worker->worker_main(shutdown_, unscheduled_tasks_, overflow_work_, i); }); + } + catch (...) + { + shutdown_locked(); + throw; + } + } + } + + void shutdown_locked() + { + shutdown_ = true; + for (auto &worker : workers_) + worker.work_ready.notify_all(); + + for (auto &worker : workers_) + if (worker.thread.joinable()) + worker.thread.join(); + } + + public: + explicit ducc_thread_pool(size_t nthreads): + workers_(nthreads) + { create_threads(); } + + //virtual + ~ducc_thread_pool() { shutdown(); } + + //virtual + size_t nthreads() const { return workers_.size(); } + + //virtual + size_t adjust_nthreads(size_t nthreads_in) const + { + if (in_parallel_region) + return 1; + if (nthreads_in==0) + return ducc0_max_threads(); + return std::min(ducc0_max_threads(), nthreads_in); + } + //virtual + void submit(std::function work) + { + lock_t lock(mut_); + if (shutdown_) + throw std::runtime_error("Work item submitted after shutdown"); + + ++unscheduled_tasks_; + + // First check for any idle workers and wake those + for (auto &worker : workers_) + if (!worker.busy_flag.test_and_set()) + { + --unscheduled_tasks_; + { + lock_t lock(worker.mut); + worker.work = std::move(work); + worker.work_ready.notify_one(); + } + return; + } + + // If no workers were idle, push onto the overflow queue for later + overflow_work_.push(std::move(work)); + } + + void shutdown() + { + lock_t lock(mut_); + shutdown_locked(); + } + + void restart() + { + shutdown_ = false; + create_threads(); + } + }; + +// return a pointer to a singleton thread_pool, which is always available +inline ducc_thread_pool *get_master_pool() + { + static auto master_pool = new ducc_thread_pool(ducc0_max_threads()-1); +#if __has_include() + static std::once_flag f; + call_once(f, + []{ + pthread_atfork( + +[]{ get_master_pool()->shutdown(); }, // prepare + +[]{ get_master_pool()->restart(); }, // parent + +[]{ get_master_pool()->restart(); } // child + ); + }); +#endif + return master_pool; + } + +thread_local thread_pool *active_pool = get_master_pool(); + +thread_pool *set_active_pool(thread_pool *new_pool) + { return std::exchange(active_pool, new_pool); } +thread_pool *get_active_pool() + { + if (!active_pool) active_pool = get_master_pool(); + MR_assert(active_pool, "no thread pool active"); + return active_pool; + } + +#endif + +#ifdef DUCC0_NO_LOWLEVEL_THREADING + +class ducc_pseudo_thread_pool: public thread_pool + { + public: + ducc_pseudo_thread_pool() {} + + //virtual + size_t nthreads() const { return 1; } + + //virtual + 
size_t adjust_nthreads(size_t /*nthreads_in*/) const + { return 1; } + //virtual + void submit(std::function work) + { work(); } + }; + +// return a pointer to a singleton thread_pool, which is always available +inline ducc_pseudo_thread_pool *get_master_pool() + { + static auto master_pool = new ducc_pseudo_thread_pool(); + return master_pool; + } + +thread_local thread_pool *active_pool = get_master_pool(); + +thread_pool *set_active_pool(thread_pool *new_pool) + { return std::exchange(active_pool, new_pool); } +thread_pool *get_active_pool() + { + MR_assert(active_pool!=nullptr, "no thread pool active"); + return active_pool; + } + +#endif + +size_t max_threads() + { return get_active_pool()->nthreads()+1; } +size_t adjust_nthreads(size_t nthreads_in) + { return get_active_pool()->adjust_nthreads(nthreads_in); } + +class Distribution + { + private: + size_t nthreads_; + Mutex mut_; + size_t nwork_; + size_t cur_; + std::atomic cur_dynamic_; + size_t chunksize_; + double fact_max_; + struct alignas(64) spaced_size_t { size_t v; }; + std::vector nextstart; + enum SchedMode { SINGLE, STATIC, DYNAMIC, GUIDED }; + SchedMode mode; + bool single_done; + + void thread_map(std::function f); + + public: + size_t nthreads() const { return nthreads_; } + + void execSingle(size_t nwork, std::function f) + { + mode = SINGLE; + single_done = false; + nwork_ = nwork; + nthreads_ = 1; + thread_map(std::move(f)); + } + void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function f) + { + mode = STATIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? (nwork_+nthreads_-1)/nthreads_ + : chunksize; + if (chunksize_>=nwork_) + return execSingle(nwork_, std::move(f)); +// if there are fewer chunks than threads, reduce nthreads + nthreads_ = std::min(nthreads_, (nwork_+chunksize_-1)/chunksize_); + nextstart.resize(nthreads_); + for (size_t i=0; i f) + { + mode = DYNAMIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize<1) ? 1 : chunksize; + if (chunksize_ >= nwork) + return execSingle(nwork, std::move(f)); + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, chunksize_, std::move(f)); + cur_dynamic_ = 0; + thread_map(std::move(f)); + } + void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function f) + { + mode = GUIDED; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nwork; + chunksize_ = (chunksize_min<1) ? 
1 : chunksize_min; + if (chunksize_*nthreads_>=nwork_) + return execStatic(nwork, nthreads, chunksize_, std::move(f)); + fact_max_ = fact_max; + cur_ = 0; + thread_map(std::move(f)); + } + void execParallel(size_t nthreads, std::function f) + { + mode = STATIC; + nthreads_ = adjust_nthreads(nthreads); + nwork_ = nthreads_; + chunksize_ = 1; + thread_map(std::move(f)); + } + Range getNext(size_t thread_id) + { + switch (mode) + { + case SINGLE: + { + if (single_done) return Range(); + single_done=true; + return Range(0, nwork_); + } + case STATIC: + { + if (nextstart[thread_id].v>=nwork_) return Range(); + size_t lo=nextstart[thread_id].v; + size_t hi=std::min(lo+chunksize_,nwork_); + nextstart[thread_id].v += nthreads_*chunksize_; + return Range(lo, hi); + } + case DYNAMIC: + { + auto curval = cur_dynamic_.fetch_add(chunksize_); + return Range(std::min(curval, nwork_), + std::min(curval+chunksize_, nwork_)); + } + case GUIDED: + { + LockGuard lck(mut_); + if (cur_>=nwork_) return Range(); + auto rem = nwork_-cur_; + size_t tmp = size_t((fact_max_*double(rem))/double(nthreads_)); + auto sz = std::min(rem, std::max(chunksize_, tmp)); + size_t lo=cur_; + cur_+=sz; + size_t hi=cur_; + return Range(lo, hi); + } + } + return Range(); + } + }; + +class MyScheduler: public Scheduler + { + private: + Distribution &dist_; + size_t ithread_; + + public: + MyScheduler(Distribution &dist, size_t ithread) + : dist_(dist), ithread_(ithread) {} + virtual size_t num_threads() const { return dist_.nthreads(); } + virtual size_t thread_num() const { return ithread_; } + virtual Range getNext() { return dist_.getNext(ithread_); } + }; + +template class ScopedValueChanger + { + private: + T &object; + T original_value; + + public: + ScopedValueChanger(T &object_, T new_value) + : object(object_), original_value(object_) { object=new_value; } + ~ScopedValueChanger() + { object=original_value; } + }; + +#define DUCC0_HIERARCHICAL_SUBMISSION +#ifdef DUCC0_HIERARCHICAL_SUBMISSION + +// The next two definitions are taken from TensorFlow sources. +// Copyright 2015 The TensorFlow Authors. + +// Basic y-combinator implementation. +template struct YCombinatorImpl { + Func func; + template + decltype(auto) operator()(Args&&... args) const { + return func(*this, std::forward(args)...); + } +}; + +template YCombinatorImpl> YCombinator(Func&& func) { + return YCombinatorImpl>{std::forward(func)}; +} + +#endif + +void Distribution::thread_map(std::function f) + { + if (nthreads_ == 1) + { + MyScheduler sched(*this, 0); + f(sched); + return; + } + + std::exception_ptr ex; + Mutex ex_mut; + // we "copy" the currently active thread pool to all executing threads + // during the execution of f. This ensures that possible nested parallel + // regions are handled by the same pool and not by the one that happens + // to be active on the worker threads. + // Alternatively we could put a "no-threading" thread pool onto the executing + // threads, which executes everything sequentially on its own thread, + // automatically prohibiting nested parallelism. 
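+  // Editor's sketch of what this pool propagation enables (hedged; uses only
+  // declarations from threading.h): nested parallel regions inside 'f' are
+  // served by the pool that launched the outer region, e.g.
+  //
+  //   ducc0::execParallel(4, [](size_t tid) {
+  //     // an execStatic(...) issued here reuses the same pool, via the
+  //     // ScopedUseThreadPool guard installed below.
+  //     });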
+ auto pool = get_active_pool(); + +#ifdef DUCC0_HIERARCHICAL_SUBMISSION + + latch counter(nthreads_); + // distribute work to helper threads, in a recursive fashion + auto new_f = YCombinator([this, &f, &counter, &ex, &ex_mut, pool](auto &new_f, size_t istart, size_t step) -> void { + try + { + ScopedValueChanger changer(in_parallel_region, true); + ScopedUseThreadPool guard(*pool); + for(; step>0; step>>=1) + if(istart+stepsubmit([&new_f, istart, step]() + {new_f(istart+step, step>>1);}); + MyScheduler sched(*this, istart); + f(sched); + } + catch (...) + { + LockGuard lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + + size_t biggest_step=1; + while (biggest_step*2submit( + [this, &f, i, &counter, &ex, &ex_mut, pool] { + try + { + ScopedUseThreadPool guard(*pool); + MyScheduler sched(*this, i); + f(sched); + } + catch (...) + { + LockGuard lock(ex_mut); + ex = std::current_exception(); + } + counter.count_down(); + }); + } + { + // do remaining work directly on this thread + ScopedValueChanger changer(in_parallel_region, true); + MyScheduler sched(*this, 0); + f(sched); + } + +#endif +#undef DUCC0_HIERARCHICAL_SUBMISSION + + counter.wait(); + if (ex) + std::rethrow_exception(ex); + } + +void execSingle(size_t nwork, std::function func) + { + Distribution dist; + dist.execSingle(nwork, std::move(func)); + } +void execStatic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execStatic(nwork, nthreads, chunksize, std::move(func)); + } +void execDynamic(size_t nwork, size_t nthreads, size_t chunksize, + std::function func) + { + Distribution dist; + dist.execDynamic(nwork, nthreads, chunksize, std::move(func)); + } +void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min, + double fact_max, std::function func) + { + Distribution dist; + dist.execGuided(nwork, nthreads, chunksize_min, fact_max, std::move(func)); + } +void execParallel(size_t nthreads, std::function func) + { + Distribution dist; + dist.execParallel(nthreads, std::move(func)); + } +void execParallel(size_t nthreads, std::function func) + { + Distribution dist; + dist.execParallel(nthreads, [&](Scheduler &sched) + { func(sched.thread_num()); }); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = adjust_nthreads(nthreads); + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(lo, hi); + }); + } +void execParallel(size_t work_lo, size_t work_hi, size_t nthreads, + std::function func) + { + nthreads = adjust_nthreads(nthreads); + execParallel(nthreads, [&](Scheduler &sched) + { + auto tid = sched.thread_num(); + auto [lo, hi] = calcShare(nthreads, tid, work_lo, work_hi); + func(tid, lo, hi); + }); + } + +}} diff --git a/contrib/ducc0/infra/threading.h b/contrib/ducc0/infra/threading.h new file mode 100644 index 000000000..21fb8300f --- /dev/null +++ b/contrib/ducc0/infra/threading.h @@ -0,0 +1,329 @@ +/** \file ducc0/infra/threading.h + * Mulithreading support, similar to functionality provided by OpenMP + * + * \copyright Copyright (C) 2019-2023 Peter Bell, Max-Planck-Society + * \authors Peter Bell, Martin Reinecke + */ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. 
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_THREADING_H
+#define DUCC0_THREADING_H
+
+// Low-level threading support can be influenced by the following macros:
+// - DUCC0_NO_LOWLEVEL_THREADING: if defined, multithreading is disabled
+//   and all parallel regions will be executed sequentially
+//   on the invoking thread.
+// - DUCC0_CUSTOM_LOWLEVEL_THREADING: if defined, external definitions of
+//   Mutex, UniqueLock, LockGuard, CondVar, set_active_pool(),
+//   and get_active_pool() must be supplied in "ducc0_custom_lowlevel_threading.h"
+//   and the code will use those.
+// The two macros must not both be defined at the same time.
+// If neither macro is defined, standard ducc0 multithreading will be active.
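+// Editor's sketch: a minimal "ducc0_custom_lowlevel_threading.h" would have
+// to place, inside namespace ducc0::detail_threading, at least the following
+// (the my_* names are hypothetical placeholders):
+//
+//   using Mutex = my_mutex;                  // lockable
+//   using UniqueLock = my_unique_lock;       // lock/unlock-able lock object
+//   using LockGuard = my_lock_guard;         // scoped lock
+//   using CondVar = my_condition_variable;   // wait(lock, pred)/notify
+//
+// plus out-of-line definitions of set_active_pool()/get_active_pool()
+// returning the externally managed thread_pool.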
+
+#if (defined(DUCC0_NO_LOWLEVEL_THREADING) && defined(DUCC0_CUSTOM_LOWLEVEL_THREADING))
+static_assert(false, "DUCC0_NO_LOWLEVEL_THREADING and DUCC0_CUSTOM_LOWLEVEL_THREADING must not both be defined");
+#endif
+
+#if defined(DUCC0_STDCXX_LOWLEVEL_THREADING)
+static_assert(false, "DUCC0_STDCXX_LOWLEVEL_THREADING must not be defined externally");
+#endif
+
+#if ((!defined(DUCC0_NO_LOWLEVEL_THREADING)) && (!defined(DUCC0_CUSTOM_LOWLEVEL_THREADING)))
+#define DUCC0_STDCXX_LOWLEVEL_THREADING
+#endif
+
+#include <cstddef>
+#include <functional>
+#include <optional>
+#include <vector>
+
+// threading-specific headers
+#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING
+#include <mutex>
+#include <condition_variable>
+#endif
+
+#ifdef DUCC0_NO_LOWLEVEL_THREADING
+// no headers needed
+#endif
+
+namespace ducc0 {
+namespace detail_threading {
+
+using std::size_t;
+
+/// Abstract base class for minimalistic thread pool functionality
+class thread_pool
+  {
+  public:
+    virtual ~thread_pool() {}
+    /// Returns the total number of threads managed by the pool
+    virtual size_t nthreads() const = 0;
+    /** "Normalizes" a requested number of threads. A useful convention could be
+        return (nthreads_in==0) ? nthreads() : min(nthreads(), nthreads_in); */
+    virtual size_t adjust_nthreads(size_t nthreads_in) const = 0;
+    virtual void submit(std::function<void()> work) = 0;
+  };
+
+}}
+
+#ifdef DUCC0_CUSTOM_LOWLEVEL_THREADING
+#include "ducc0_custom_lowlevel_threading.h"
+#endif
+
+namespace ducc0 {
+
+namespace detail_threading {
+
+thread_pool *set_active_pool(thread_pool *new_pool);
+thread_pool *get_active_pool();
+
+// define threading-related types depending on the underlying implementation
+#ifdef DUCC0_STDCXX_LOWLEVEL_THREADING
+using Mutex = std::mutex;
+using UniqueLock = std::unique_lock<std::mutex>;
+using LockGuard = std::lock_guard<std::mutex>;
+using CondVar = std::condition_variable;
+#endif
+
+#ifdef DUCC0_NO_LOWLEVEL_THREADING
+struct Mutex
+  {
+  void lock(){}
+  void unlock(){}
+  };
+struct LockGuard
+  {
+  LockGuard(const Mutex &){}
+  };
+struct UniqueLock
+  {
+  UniqueLock(const Mutex &){}
+  void lock() {}
+  void unlock() {}
+  };
+struct CondVar
+  {
+  template<typename Predicate>
+  void wait(UniqueLock &, Predicate) {}
+  void notify_one() noexcept {}
+  void notify_all() noexcept {}
+  };
+#endif
+
+using std::size_t;
+
+class ScopedUseThreadPool
+  {
+  private:
+    thread_pool *old_pool_;
+  public:
+    ScopedUseThreadPool(thread_pool &pool)
+      { old_pool_ = set_active_pool(&pool); }
+    ~ScopedUseThreadPool()
+      { set_active_pool(old_pool_); }
+  };
+
+/// Index range describing a chunk of work inside a parallelized loop
+struct Range
+  {
+  size_t lo, //< first index of the chunk
+         hi; //< one-past-last index of the chunk
+  Range() : lo(0), hi(0) {}
+  Range(size_t lo_, size_t hi_) : lo(lo_), hi(hi_) {}
+  /// Returns true iff the chunk is not empty
+  operator bool() const { return hi>lo; }
+  };
+
+/// Class supplied to parallel regions, which allows them to determine their
+/// work chunks.
+class Scheduler
+  {
+  public:
+    virtual ~Scheduler() {}
+    /// Returns the number of threads working in this parallel region
+    virtual size_t num_threads() const = 0;
+    /// Returns the number of this thread, from the range 0 to num_threads()-1.
+    virtual size_t thread_num() const = 0;
+    /// Returns information about the next chunk of work.
+    /// If this chunk is empty, the work on this thread is done.
+    virtual Range getNext() = 0;
+  };
+
+/** Returns the maximum number of threads that are supported by the currently
+    active thread pool.
+ */
+size_t max_threads();
+size_t adjust_nthreads(size_t nthreads);
+
+/// Execute \a func over \a nwork work items, on a single thread.
+void execSingle(size_t nwork,
+  std::function<void(Scheduler &)> func);
+/// Execute \a func over \a nwork work items, on \a nthreads threads.
+/** Chunks will have the size \a chunksize, except for the last one which
+ * may be smaller.
+ *
+ * Chunks are statically assigned to threads at startup. */
+void execStatic(size_t nwork, size_t nthreads, size_t chunksize,
+  std::function<void(Scheduler &)> func);
+/// Execute \a func over \a nwork work items, on \a nthreads threads.
+/** Chunks will have the size \a chunksize, except for the last one which
+ * may be smaller.
+ *
+ * Chunks are assigned dynamically to threads; whenever a thread is finished
+ * with its current chunk, it will obtain the next one from the list of
+ * remaining chunks. */
+void execDynamic(size_t nwork, size_t nthreads, size_t chunksize,
+  std::function<void(Scheduler &)> func);
+void execGuided(size_t nwork, size_t nthreads, size_t chunksize_min,
+  double fact_max, std::function<void(Scheduler &)> func);
+/// Execute \a func on \a nthreads threads.
+/** Work subdivision must be organized within \a func. */
+void execParallel(size_t nthreads, std::function<void(Scheduler &)> func);
+/// Execute \a func on \a nthreads threads, passing only the thread number.
+/** Work subdivision must be organized within \a func. */
+void execParallel(size_t nthreads, std::function<void(size_t)> func);
+/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads.
+/** Work items are subdivided fairly among threads. */
+void execParallel(size_t work_lo, size_t work_hi, size_t nthreads,
+  std::function<void(size_t, size_t)> func);
+/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads.
+/** Work items are subdivided fairly among threads. */
+inline void execParallel(size_t nwork, size_t nthreads,
+  std::function<void(size_t, size_t)> func)
+  { execParallel(0, nwork, nthreads, func); }
+/// Execute \a func on work items [\a lo; \a hi[ over \a nthreads threads.
+/** The first argument to \a func is the thread number.
+ *
+ * Work items are subdivided fairly among threads. */
+void execParallel(size_t work_lo, size_t work_hi, size_t nthreads,
+  std::function<void(size_t, size_t, size_t)> func);
+/// Execute \a func on work items [0; \a nwork[ over \a nthreads threads.
+/** The first argument to \a func is the thread number.
+ *
+ * Work items are subdivided fairly among threads. */
+inline void execParallel(size_t nwork, size_t nthreads,
+  std::function<void(size_t, size_t, size_t)> func)
+  { execParallel(0, nwork, nthreads, func); }
+
+template<typename T> class Worklist
+  {
+  private:
+    Mutex mtx;
+    CondVar cv;
+    size_t nworking{0};
+    std::vector<T> items;
+
+  public:
+    Worklist(const std::vector<T> &items_)
+      : items(items_) {}
+
+    std::optional<T> get_item()
+      {
+      UniqueLock lck(mtx);
+      if ((--nworking==0) && items.empty()) cv.notify_all();
+      cv.wait(lck,[&](){return (!items.empty()) || (nworking==0);});
+      if (!items.empty())
+        {
+        auto res = items.back();
+        items.pop_back();
+        ++nworking;
+        return res;
+        }
+      else
+        return {};
+      }
+    void startup()
+      {
+      LockGuard lck(mtx);
+      ++nworking;
+      }
+    void put_item(const T &item)
+      {
+      LockGuard lck(mtx);
+      items.push_back(item);
+      cv.notify_one();
+      }
+  };
+
+/// Execute \a func on work items in \a items over \a nthreads threads.
+/** While processing a work item, \a func may submit further items to the list
+ * of work items. For this purpose, \a func must take a const T &
+ * (the work item to be processed) as well as a function which also takes
+ * a const T & (the insert function). Work items will be assigned whenever a
+ * thread becomes available.
*/ +template auto execWorklist + (size_t nthreads, const std::vector &items, Func &&func) + { + Worklist wl(items); + execParallel(nthreads, [&wl, &func](auto &) { + wl.startup(); + while(auto wrk=wl.get_item()) + func(wrk.value(), [&wl](const T &item){wl.put_item(item);}); + }); + } + +} // end of namespace detail_threading + +using detail_threading::Mutex; +using detail_threading::LockGuard; +using detail_threading::UniqueLock; +using detail_threading::CondVar; +using detail_threading::thread_pool; +using detail_threading::ScopedUseThreadPool; +using detail_threading::max_threads; +using detail_threading::adjust_nthreads; +using detail_threading::Scheduler; +using detail_threading::execSingle; +using detail_threading::execStatic; +using detail_threading::execDynamic; +using detail_threading::execGuided; +using detail_threading::execParallel; +using detail_threading::execWorklist; + +} // end of namespace ducc0 + +#endif diff --git a/contrib/ducc0/infra/useful_macros.h b/contrib/ducc0/infra/useful_macros.h new file mode 100644 index 000000000..eaef26779 --- /dev/null +++ b/contrib/ducc0/infra/useful_macros.h @@ -0,0 +1,74 @@ +/* +This file is part of the ducc library. + +Copyright (C) 2010-2022 Max-Planck-Society + +Author: Martin Reinecke +*/ + +/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */ + +/* +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. +* Neither the name of the copyright holder nor the names of its contributors may + be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
diff --git a/contrib/ducc0/infra/useful_macros.h b/contrib/ducc0/infra/useful_macros.h
new file mode 100644
index 000000000..eaef26779
--- /dev/null
+++ b/contrib/ducc0/infra/useful_macros.h
@@ -0,0 +1,74 @@
+/*
+This file is part of the ducc library.
+
+Copyright (C) 2010-2022 Max-Planck-Society
+
+Author: Martin Reinecke
+*/
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_USEFUL_MACROS_H
+#define DUCC0_USEFUL_MACROS_H
+
+#if defined(__GNUC__)
+#define DUCC0_NOINLINE [[gnu::noinline]]
+#define DUCC0_RESTRICT __restrict__
+#define DUCC0_PREFETCH_R(addr) __builtin_prefetch(addr);
+#define DUCC0_PREFETCH_W(addr) __builtin_prefetch(addr,1);
+#elif defined(_MSC_VER)
+#define DUCC0_NOINLINE __declspec(noinline)
+#define DUCC0_RESTRICT __restrict
+#define DUCC0_PREFETCH_R(addr)
+#define DUCC0_PREFETCH_W(addr)
+#else
+#define DUCC0_NOINLINE
+#define DUCC0_RESTRICT
+#define DUCC0_PREFETCH_R(addr)
+#define DUCC0_PREFETCH_W(addr)
+#endif
+
+#endif
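Illustrative only (not from the patch): a typical application of the portability macros above; axpy is a made-up helper.

    #include <cstddef>
    #include "ducc0/infra/useful_macros.h"

    // never inlined; x and y are promised not to alias
    DUCC0_NOINLINE void axpy(size_t n, double a,
                             const double* DUCC0_RESTRICT x,
                             double* DUCC0_RESTRICT y) {
      for (size_t i = 0; i < n; ++i) {
        DUCC0_PREFETCH_R(&x[i])  // read hint; supplies its own ';' on GCC/Clang,
                                 // expands to nothing elsewhere
        y[i] += a * x[i];
      }
    }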
diff --git a/contrib/ducc0/math/cmplx.h b/contrib/ducc0/math/cmplx.h
new file mode 100644
index 000000000..522a3bdda
--- /dev/null
+++ b/contrib/ducc0/math/cmplx.h
@@ -0,0 +1,108 @@
+/** \file ducc0/math/cmplx.h
+ * Minimalistic complex number class
+ *
+ * \copyright Copyright (C) 2019-2023 Max-Planck-Society
+ * \author Martin Reinecke
+ */
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
+ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+/*
+ * This code is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This code is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this code; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef DUCC0_CMPLX_H
+#define DUCC0_CMPLX_H
+
+namespace ducc0 {
+
+/// Very basic class representing complex numbers
+/** Meant exclusively for internal low-level use, e.g. in FFT routines. */
+template<typename T> struct Cmplx {
+  T r, i;
+  Cmplx() {}
+  constexpr Cmplx(T r_, T i_) : r(r_), i(i_) {}
+  constexpr Cmplx(T r_) : r(r_), i(T(0)) {}
+  void Set(T r_, T i_) { r=r_; i=i_; }
+  void Set(T r_) { r=r_; i=T(0); }
+  void Split(T &r_, T &i_) const { r_=r; i_=i; }
+  void SplitConj(T &r_, T &i_) const { r_=r; i_=-i; }
+  Cmplx &operator+= (const Cmplx &other)
+    { r+=other.r; i+=other.i; return *this; }
+  template<typename T2> Cmplx &operator*= (T2 other)
+    { r*=other; i*=other; return *this; }
+  template<typename T2> Cmplx &operator*= (const Cmplx<T2> &other)
+    {
+    T tmp = r*other.r - i*other.i;
+    i = r*other.i + i*other.r;
+    r = tmp;
+    return *this;
+    }
+  Cmplx conj() const { return {r, -i}; }
+  template<typename T2> Cmplx &operator+= (const Cmplx<T2> &other)
+    { r+=other.r; i+=other.i; return *this; }
+  template<typename T2> Cmplx &operator-= (const Cmplx<T2> &other)
+    { r-=other.r; i-=other.i; return *this; }
+  template<typename T2> auto operator* (const T2 &other) const
+    -> Cmplx<decltype(r*other)>
+    { return {r*other, i*other}; }
+  template<typename T2> auto operator+ (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    { return {r+other.r, i+other.i}; }
+  template<typename T2> auto operator- (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r-other.r)>
+    { return {r-other.r, i-other.i}; }
+  template<typename T2> auto operator* (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    { return {r*other.r-i*other.i, r*other.i + i*other.r}; }
+  template<bool fwd, typename T2> auto special_mul (const Cmplx<T2> &other) const
+    -> Cmplx<decltype(r+other.r)>
+    {
+    using Tres = Cmplx<decltype(r+other.r)>;
+    return fwd ? Tres(r*other.r+i*other.i, i*other.r-r*other.i)
+               : Tres(r*other.r-i*other.i, r*other.i+i*other.r);
+    }
+  };
+
+}
+
+#endif
diff --git a/contrib/ducc0/math/unity_roots.h b/contrib/ducc0/math/unity_roots.h
new file mode 100644
index 000000000..09df542b2
--- /dev/null
+++ b/contrib/ducc0/math/unity_roots.h
@@ -0,0 +1,241 @@
+/* Copyright (C) 2019-2021 Max-Planck-Society
+   Author: Martin Reinecke */
+
+/* SPDX-License-Identifier: BSD-3-Clause OR GPL-2.0-or-later */
+
+/*
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this
+  list of conditions and the following disclaimer in the documentation and/or
+  other materials provided with the distribution.
+* Neither the name of the copyright holder nor the names of its contributors may
+  be used to endorse or promote products derived from this software without
+  specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +/* + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This code is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this code; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef DUCC0_UNITY_ROOTS_H +#define DUCC0_UNITY_ROOTS_H + +#include +#include +#include +#include + +namespace ducc0 { + +namespace detail_unity_roots { + +using namespace std; + +template class UnityRoots + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + static cmplx_ calc(size_t x, size_t n, Thigh ang) + { + x<<=3; + if (x<4*n) // first half + { + if (x<2*n) // first quadrant + { + if (x>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + idx = N-idx; + auto x1=v1[idx&mask], x2=v2[idx>>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), -T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +template class MultiExp + { + private: + using Thigh = typename conditional<(sizeof(T)>sizeof(double)), T, double>::type; + struct cmplx_ { Thigh r, i; }; + size_t N, mask, shift; + vector v1, v2; + + public: + MultiExp(T ang0, size_t n) + : N(n) + { + Thigh ang = ang0; + size_t nval = n+2; + shift = 1; + while((size_t(1)<>shift]; + return Tc(T(x1.r*x2.r-x1.i*x2.i), T(x1.r*x2.i+x1.i*x2.r)); + } + }; + +} + +using detail_unity_roots::UnityRoots; +using detail_unity_roots::MultiExp; + +} + +#endif diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp index eb7189da0..bc9b36b29 100644 --- a/examples/guru1d1.cpp +++ b/examples/guru1d1.cpp @@ -20,9 +20,9 @@ int main(int argc, char* argv[]) Barnett 2/27/20 Compile on linux with (or see ../makefile): - g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm + g++-7 -std=c++17 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 - Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1 */ diff --git a/examples/guru1d1c.c b/examples/guru1d1c.c index 7ad036f4b..9be63a0d5 100644 --- a/examples/guru1d1c.c +++ b/examples/guru1d1c.c @@ -12,9 +12,9 @@ int main(int argc, char* argv[]) C complex type, with a math check. Barnett 6/22/20. 
Compile on linux with: - gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ + gcc-7 -fopenmp guru1d1c.c -I../include ../lib-static/libfinufft.a -o guru1d1c -lm -lstdc++ - Or if you have built a single-core library, remove -fopenmp and -lfftw3_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1c. See also: guru1d1 */ diff --git a/examples/guru1d1f.cpp b/examples/guru1d1f.cpp index a46c4a735..72b706d77 100644 --- a/examples/guru1d1f.cpp +++ b/examples/guru1d1f.cpp @@ -19,9 +19,9 @@ int main(int argc, char* argv[]) Barnett 7/5/20 Compile on linux with: - g++-7 -std=c++14 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f -lfftw3f -lfftw3f_omp -lm + g++-7 -std=c++17 -fopenmp guru1d1f.cpp -I../include ../lib-static/libfinufft.a -o guru1d1f - Or if you have built a single-core library, remove -fopenmp and -lfftw3f_omp + Or if you have built a single-core library, remove -fopenmp Usage: ./guru1d1f */ diff --git a/examples/guru2d1.cpp b/examples/guru2d1.cpp index 06d25e064..13a29928b 100644 --- a/examples/guru2d1.cpp +++ b/examples/guru2d1.cpp @@ -13,9 +13,9 @@ int main(int argc, char *argv[]){ except illustrates the guru interface. Compile multithreaded with - g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 single core with: - g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 -lfftw3 -lm + g++ guru2d1.cpp -I ../src ../lib-static/libfinufft.a -o guru2d1 Usage: ./guru2d1 */ diff --git a/examples/many1d1.cpp b/examples/many1d1.cpp index 8176007c9..353db1504 100644 --- a/examples/many1d1.cpp +++ b/examples/many1d1.cpp @@ -12,9 +12,9 @@ int main(int argc, char* argv[]) double complex vectors, with a math test. Compile with: - g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 or if you have built a single-core version: - g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 -lfftw3 -lm + g++ many1d1.cpp -I../include ../lib-static/libfinufft.a -o many1d1 Usage: ./many1d1 */ diff --git a/examples/simple1d1.cpp b/examples/simple1d1.cpp index cb1b9e493..7dac780ea 100644 --- a/examples/simple1d1.cpp +++ b/examples/simple1d1.cpp @@ -15,9 +15,9 @@ int main(int argc, char* argv[]) Double-precision version (see simple1d1f for single-precision) Compile with: - g++ -fopenmp simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 or if you have built a single-core version: - g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 -lfftw3 -lm + g++ simple1d1.cpp -I../include ../lib-static/libfinufft.a -o simple1d1 Usage: ./simple1d1 */ diff --git a/examples/simple1d1c.c b/examples/simple1d1c.c index b3c718659..6a9ac7e69 100644 --- a/examples/simple1d1c.c +++ b/examples/simple1d1c.c @@ -13,9 +13,9 @@ int main(int argc, char* argv[]) with a math test. Double-precision. C99 style. opts is struct not ptr to it. 
Compile with: - gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lfftw3_omp -lm -lstdc++ + gcc -fopenmp example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lm -lstdc++ or if you have built a single-core version: - gcc example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lfftw3 -lm -lstdc++ + gcc example1d1c.c -I../include ../lib-static/libfinufft.a -o example1d1c -lm -lstdc++ Usage: ./example1d1c */ diff --git a/examples/simple1d1cf.c b/examples/simple1d1cf.c index db79c06e1..1bde4af95 100644 --- a/examples/simple1d1cf.c +++ b/examples/simple1d1cf.c @@ -13,9 +13,9 @@ int main(int argc, char* argv[]) with a math test. Single-precision version. C99 style. opts is a struct. Compile with: - gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lfftw3f_omp -lm -lstdc++ + gcc -fopenmp example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lm -lstdc++ or if you have built a single-core version: - gcc example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lfftw3f -lm -lstdc++ + gcc example1d1cf.c -I../include ../lib-static/libfinufft.a -o example1d1cf -lm -lstdc++ Usage: ./example1d1cf */ diff --git a/examples/simple1d1f.cpp b/examples/simple1d1f.cpp index fea98b8d6..68f1c0259 100644 --- a/examples/simple1d1f.cpp +++ b/examples/simple1d1f.cpp @@ -15,9 +15,9 @@ int main(int argc, char* argv[]) (See simple1d1 for double-precision version.) Compile with: - g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lfftw3f_omp -lm + g++ -fopenmp simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f or if you have built a single-core version: - g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f -lfftw3f -lm + g++ simple1d1f.cpp -I../include ../lib-static/libfinufft.a -o simple1d1f Usage: ./simple1d1f */ diff --git a/examples/simple2d1.cpp b/examples/simple2d1.cpp index cf912445b..b383aca40 100644 --- a/examples/simple2d1.cpp +++ b/examples/simple2d1.cpp @@ -14,9 +14,9 @@ int main(int argc, char *argv[]){ arrays of C++ complex numbers, with a math test. Double precision version. Compile multithreaded with - g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lfftw3_omp -lm + g++ -fopenmp simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 single core with: - g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 -lfftw3 -lm + g++ simple2d1.cpp -I ../src ../lib-static/libfinufft.a -o simple2d1 Usage: ./simple2d1 */ diff --git a/examples/simulplans1d1.cpp b/examples/simulplans1d1.cpp index b814876a2..25e824f03 100644 --- a/examples/simulplans1d1.cpp +++ b/examples/simulplans1d1.cpp @@ -34,13 +34,12 @@ double chk1d1(int n, vector& x, vector>& c, int main(int argc, char* argv[]) /* Demo two simultaneous FINUFFT plans (A,B) being handled in C++ without - interacting (or at least without crashing; note that FFTW initialization - is the only global state of FINUFFT library). + interacting. Using STL double complex vectors, with a math test. 
Edited from guru1d1, Barnett 2/15/22 Compile & run: - g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 -lfftw3 -lfftw3_omp -lm && ./simulplans1d1 + g++ -fopenmp simulplans1d1.cpp -I../include ../lib-static/libfinufft.a -o simulplans1d1 && ./simulplans1d1 */ { double tol = 1e-9; // desired accuracy for both plans diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp index f25f25b8b..0dd4212c4 100644 --- a/examples/threadsafe1d1.cpp +++ b/examples/threadsafe1d1.cpp @@ -15,9 +15,6 @@ int main(int argc, char* argv[]) Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test. Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp. - Notes: You may not have libfftw3_omp, so I have switched to - libfftw3_threads in this suggested compile command: - g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1 Usage: ./threadsafe1d1 diff --git a/fortran/examples/guru1d1.f b/fortran/examples/guru1d1.f index 3f9c66827..3877a5c2c 100755 --- a/fortran/examples/guru1d1.f +++ b/fortran/examples/guru1d1.f @@ -6,7 +6,7 @@ c To compile (linux/GCC) from this directory, use eg (paste to one line): c gfortran -fopenmp -I../../include -I/usr/include guru1d1.f -c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ -o guru1d1 +c ../../lib/libfinufft.so -lgomp -lstdc++ -o guru1d1 c Alex Barnett and Libin Lu 5/29/20. ptr fixes 10/6/21 @@ -15,8 +15,6 @@ program guru1d1 c our fortran header, always needed include 'finufft.fh' -c if you want to use FFTW's modes by name... - include 'fftw3.f' c note some inputs are int (int*4) but others BIGINT (int*8) integer ier,iflag @@ -41,7 +39,7 @@ program guru1d1 c how many nonuniform pts M = 1000000 -c how many modes (not too much since FFTW_MEASURE slow later) +c how many modes N = 100000 allocate(fk(N)) @@ -104,8 +102,6 @@ program guru1d1 print *,'' print *, 'setting new options, rerun guru interface...' call finufft_default_opts(opts) -c refer to fftw3.f to set various FFTW plan modes... - opts%fftw = FFTW_ESTIMATE_PATIENT opts%debug = 1 c note you need a fresh plan if change opts call finufft_makeplan(ttype,dim,n_modes,iflag,ntrans, diff --git a/fortran/examples/guru1d1f.f b/fortran/examples/guru1d1f.f index 90d43174c..74508feaf 100755 --- a/fortran/examples/guru1d1f.f +++ b/fortran/examples/guru1d1f.f @@ -15,8 +15,6 @@ program guru1d1f c our fortran-header, always needed include 'finufft.fh' -c if you want to use FFTW's modes by name... - include 'fftw3.f' c note some inputs are int (int*4) but others BIGINT (int*8) integer ier,iflag @@ -40,7 +38,7 @@ program guru1d1f c how many nonuniform pts M = 200000 -c how many modes (not too much since FFTW_MEASURE slow later) +c how many modes N = 100000 allocate(fk(N)) @@ -103,8 +101,6 @@ program guru1d1f print *,'' print *, 'setting new options, rerun guru interface...' call finufftf_default_opts(opts) -c refer to fftw3.f to set various FFTW plan modes... 
- opts%fftw = FFTW_ESTIMATE_PATIENT opts%debug = 1 c note you need a fresh plan if change opts call finufftf_makeplan(ttype,dim,n_modes,iflag,ntrans, diff --git a/fortran/examples/nufft1d_demo.f b/fortran/examples/nufft1d_demo.f index e63a7e434..9b16b266b 100755 --- a/fortran/examples/nufft1d_demo.f +++ b/fortran/examples/nufft1d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib; paste to a single line): c c gfortran nufft1d_demo.f ../directft/dirft1d.f -o nufft1d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft1d_demo implicit none diff --git a/fortran/examples/nufft1d_demof.f b/fortran/examples/nufft1d_demof.f index 13a40d601..24d7c4797 100755 --- a/fortran/examples/nufft1d_demof.f +++ b/fortran/examples/nufft1d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib; paste to a single line): c c gfortran nufft1d_demof.f ../directft/dirft1df.f -o nufft1d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft1d_demof implicit none diff --git a/fortran/examples/nufft2d_demo.f b/fortran/examples/nufft2d_demo.f index b37cfeffc..281112e4a 100755 --- a/fortran/examples/nufft2d_demo.f +++ b/fortran/examples/nufft2d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib, paste to a single line): c c gfortran nufft2d_demo.f ../directft/dirft2d.f -o nufft2d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft2d_demo implicit none diff --git a/fortran/examples/nufft2d_demof.f b/fortran/examples/nufft2d_demof.f index b649f6109..7192274de 100755 --- a/fortran/examples/nufft2d_demof.f +++ b/fortran/examples/nufft2d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static lib, paste to a single line): c c gfortran nufft2d_demof.f ../directft/dirft2df.f -o nufft2d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft2d_demof implicit none diff --git a/fortran/examples/nufft2dmany_demo.f b/fortran/examples/nufft2dmany_demo.f index 605237161..7a4928f41 100755 --- a/fortran/examples/nufft2dmany_demo.f +++ b/fortran/examples/nufft2dmany_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded; paste to a single line): c c gfortran nufft2dmany_demo.f ../directft/dirft2d.f -o nufft2dmany_demo -c -L../../lib -lfinufft -lfftw3 -lfftw3_omp -lstdc++ +c -L../../lib -lfinufft -lstdc++ c program nufft2dmany_demo implicit none diff --git a/fortran/examples/nufft3d_demo.f b/fortran/examples/nufft3d_demo.f index af04afa5f..17c4074e6 100755 --- a/fortran/examples/nufft3d_demo.f +++ b/fortran/examples/nufft3d_demo.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static, paste to a single line): c c gfortran nufft3d_demo.f ../directft/dirft3d.f -o nufft3d_demo -c ../../lib-static/libfinufft.a -lstdc++ -lfftw3 -lfftw3_omp -lm -fopenmp +c ../../lib-static/libfinufft.a -lstdc++ -lm -fopenmp c program nufft3d_demo implicit none diff --git a/fortran/examples/nufft3d_demof.f b/fortran/examples/nufft3d_demof.f index 2e5a9e21f..6cc856574 100755 --- a/fortran/examples/nufft3d_demof.f +++ b/fortran/examples/nufft3d_demof.f @@ -12,7 +12,7 @@ c Compile with, eg (GCC, multithreaded, static, paste to a single line): c c gfortran 
nufft3d_demof.f ../directft/dirft3df.f -o nufft3d_demof -c ../../lib-static/libfinufftf.a -lstdc++ -lfftw3f -lfftw3f_omp -lm -fopenmp +c ../../lib-static/libfinufftf.a -lstdc++ -lm -fopenmp c program nufft3d_demof implicit none diff --git a/fortran/examples/simple1d1.f b/fortran/examples/simple1d1.f index f187806ec..ee9a81f19 100755 --- a/fortran/examples/simple1d1.f +++ b/fortran/examples/simple1d1.f @@ -6,7 +6,7 @@ c To compile (linux/GCC) from this directory, use eg (paste to one line): c gfortran -fopenmp -I../../include simple1d1.f -o simple1d1 -c ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ +c ../../lib/libfinufft.so -lgomp -lstdc++ c Alex Barnett and Libin Lu 5/28/20, fix ptrs 10/6/21 diff --git a/fortran/examples/simple1d1.f90 b/fortran/examples/simple1d1.f90 index e1c6dcc2f..2368ad92f 100755 --- a/fortran/examples/simple1d1.f90 +++ b/fortran/examples/simple1d1.f90 @@ -6,7 +6,7 @@ ! To compile (linux/GCC) from this directory, note the module also has to be ! compiled, eg: -! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lfftw3 -lfftw3_omp -lgomp -lstdc++ +! gfortran -fopenmp ../../include/finufft_mod.f90 simple1d1.f90 -o simple1d1_f90 ../../lib/libfinufft.so -lgomp -lstdc++ ! Alex Barnett, to demo Reinhard Neder f90 module, 1/20/23. diff --git a/include/finufft/defs.h b/include/finufft/defs.h index 77bc69b6b..f06710989 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -166,10 +166,6 @@ // -------- FINUFFT's plan object, prec-switching version ------------------ // NB: now private (the public C++ or C etc user sees an opaque pointer to it) -// FFTW is needed since we include a FFTW plan in the FINUFFT plan... -#include // (must come after complex.h) -// (other FFT lib headers eg MKL could be here...) - // group together a bunch of type 3 rescaling/centering/phasing parameters: #define TYPE3PARAMS FINUFFTIFY(_type3Params) typedef struct { @@ -186,7 +182,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ BIGINT nj; // num of NU pts in type 1,2 (for type 3, num input x pts) BIGINT nk; // number of NU freq pts (type 3 only) FLT tol; // relative user tolerance - int batchSize; // # strength vectors to group together for FFTW, etc + int batchSize; // # strength vectors to group together for FFT, etc int nbatch; // how many batches done to cover all ntrans vectors BIGINT ms; // number of modes in x (1) dir (historical CMCL name) = N1 @@ -205,8 +201,7 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ FLT* phiHat2; // " y-axis. FLT* phiHat3; // " z-axis. - FFTW_CPX* fwBatch; // (batches of) fine grid(s) for FFTW to plan - // & act on. 
Usually the largest working array + CPX* fwBatch; // FIXME: UNUSED, kept for layout compatibility BIGINT *sortIndices; // precomputed NU pt permutation, speeds spread/interp bool didSort; // whether binsorting used (false: identity perm used) @@ -218,13 +213,12 @@ typedef struct FINUFFT_PLAN_S { // the main plan object, fully C++ FLT *S, *T, *U; // pointers to user's target NU pts arrays (no new allocs) CPX* prephase; // pre-phase, for all input NU pts CPX* deconv; // reciprocal of kernel FT, phase, all output NU pts - CPX* CpBatch; // working array of prephased strengths + CPX* CpBatch; // FIXME: UNUSED, kept for layout compatibility FLT *Sp, *Tp, *Up; // internal primed targs (s'_k, etc), allocated TYPE3PARAMS t3P; // groups together type 3 shift, scale, phase, parameters FINUFFT_PLAN innerT2plan; // ptr used for type 2 in step 2 of type 3 // other internal structs; each is C-compatible of course - FFTW_PLAN fftwPlan; finufft_opts opts; // this and spopts could be made ptrs finufft_spread_opts spopts; diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h deleted file mode 100644 index 89d86f0de..000000000 --- a/include/finufft/fftw_defs.h +++ /dev/null @@ -1,48 +0,0 @@ -// all FFTW-related private FINUFFT headers - -#ifndef FFTW_DEFS_H -#define FFTW_DEFS_H - -// Here we define typedefs and MACROS to switch between single and double -// precision library compilation, which need different FFTW command symbols. -// Barnett simplified via FFTWIFY, 6/7/22. - -#include // (after complex.h) needed so can typedef FFTW_CPX - -// precision-switching names for interfaces to FFTW... -#ifdef SINGLE - // macro to prepend fftw_ (for double) or fftwf_ (for single) to a string - // without a space. The 2nd level of indirection is needed for safety, see: - // https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting - #define FFTWIFY_UNSAFE(x) fftwf_##x -#else - #define FFTWIFY_UNSAFE(x) fftw_##x -#endif -#define FFTWIFY(x) FFTWIFY_UNSAFE(x) -// now use this tool (note we replaced typedefs v<=2.0.4, in favor of macros): -#define FFTW_CPX FFTWIFY(complex) -#define FFTW_PLAN FFTWIFY(plan) -#define FFTW_ALLOC_RE FFTWIFY(alloc_real) -#define FFTW_ALLOC_CPX FFTWIFY(alloc_complex) -#define FFTW_PLAN_1D FFTWIFY(plan_dft_1d) -#define FFTW_PLAN_2D FFTWIFY(plan_dft_2d) -#define FFTW_PLAN_3D FFTWIFY(plan_dft_3d) -#define FFTW_PLAN_MANY_DFT FFTWIFY(plan_many_dft) -#define FFTW_EX FFTWIFY(execute) -#define FFTW_DE FFTWIFY(destroy_plan) -#define FFTW_FR FFTWIFY(free) -#define FFTW_FORGET_WISDOM FFTWIFY(forget_wisdom) -#define FFTW_CLEANUP FFTWIFY(cleanup) -// the following OMP switch could be done in the src code instead... -#ifdef _OPENMP - #define FFTW_INIT FFTWIFY(init_threads) - #define FFTW_PLAN_TH FFTWIFY(plan_with_nthreads) - #define FFTW_CLEANUP_THREADS FFTWIFY(cleanup_threads) -#else - // no OMP (no fftw{f}_threads or _omp), need dummy fftw threads calls... 
- #define FFTW_INIT() - #define FFTW_PLAN_TH(x) - #define FFTW_CLEANUP_THREADS() -#endif - -#endif // FFTW_DEFS_H diff --git a/include/finufft/finufft_eitherprec.h b/include/finufft/finufft_eitherprec.h index f46272011..abbf7edfb 100644 --- a/include/finufft/finufft_eitherprec.h +++ b/include/finufft/finufft_eitherprec.h @@ -118,5 +118,3 @@ typedef struct FINUFFT_PLAN_S * FINUFFT_PLAN; #undef FINUFFT_PLAN #undef FINUFFT_PLAN_S #undef FINUFFT_TYPE3PARAMS -#undef FINUFFT_FFTW_CPX -#undef FINUFFT_FFTW_PLAN diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index 54b058266..cf53bb699 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -20,9 +20,6 @@ #include // prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc... #include -// since "many" (vector) tests need direct access to FFTW commands... -// (although this now happens to be included in defs.h too) -#include // std stuff for tester src #include diff --git a/include/finufft_opts.h b/include/finufft_opts.h index 3a0156000..3d4da53af 100644 --- a/include/finufft_opts.h +++ b/include/finufft_opts.h @@ -20,7 +20,7 @@ typedef struct finufft_opts{ // defaults see finufft.cpp:finufft_default_opts() // algorithm performance opts... int nthreads; // number of threads to use, or 0 uses all available - int fftw; // plan flags to FFTW (FFTW_ESTIMATE=64, FFTW_MEASURE=0,...) + int fftw; // FIXME: UNUSED, kept for layout compatibility int spread_sort; // spreader: 0 don't sort, 1 do, or 2 heuristic choice int spread_kerevalmeth; // spreader: 0 exp(sqrt()), 1 Horner piecewise poly (faster) int spread_kerpad; // (exp(sqrt()) only): 0 don't pad kernel to 4n, 1 do diff --git a/make.inc.macosx_arm64 b/make.inc.macosx_arm64 index 8889fb964..50c42aee9 100644 --- a/make.inc.macosx_arm64 +++ b/make.inc.macosx_arm64 @@ -31,14 +31,11 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib -L/opt/homebrew/opt/libomp/lib # OpenMP with clang needs following... OMPFLAGS = -Xpreprocessor -fopenmp OMPLIBS = -L/usr/local/lib -L/opt/homebrew/lib -lomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: this will probably segfault. Instead we suggest you use # make.inc.macosx_clang_matlab -# Some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # may need to edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_arm64_matlab2022b_beta b/make.inc.macosx_arm64_matlab2022b_beta index 762e55c3b..b8f685f77 100644 --- a/make.inc.macosx_arm64_matlab2022b_beta +++ b/make.inc.macosx_arm64_matlab2022b_beta @@ -37,12 +37,9 @@ OMPLIBS = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/toolbox/eml/extern # we need to use -Wl,-rpath to add iomp lib directory to the runtime library search path # add iomp runtime search path to linker flags LDFLAGS += -Wl,-rpath,$(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/toolbox/eml/externalDependency/omp/maca64/lib/ -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location, but this is where -# brew should put things... +# this is where brew should put things... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # should work, or edit for your MATLAB version location... 
MEX = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/bin/mex @@ -52,5 +49,3 @@ MEX = $(shell ls -d /Applications/MATLAB_R2022b_Beta.app)/bin/mex # extras by Wallace Chen: LDFLAGS += -L/opt/homebrew/opt/libomp/lib CXXFLAGS += -I/opt/homebrew/opt/libomp/include -LDFLAGS += -L/opt/homebrew/opt/fftw/lib -CXXFLAGS += -I/opt/homebrew/opt/fftw/include diff --git a/make.inc.macosx_clang b/make.inc.macosx_clang index 9ca4734d5..35149a26d 100644 --- a/make.inc.macosx_clang +++ b/make.inc.macosx_clang @@ -31,14 +31,11 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with clang needs following... OMPFLAGS = -Xpreprocessor -fopenmp OMPLIBS = -L/usr/local/lib -L/usr/local/opt/libomp/lib -lomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: this will probably segfault. Instead we suggest you use # make.inc.macosx_clang_matlab -# Some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -L/usr/local/lib -lm # may need to edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_clang_matlab b/make.inc.macosx_clang_matlab index c856d537b..e7853d6b1 100644 --- a/make.inc.macosx_clang_matlab +++ b/make.inc.macosx_clang_matlab @@ -34,12 +34,9 @@ OMPLIBS = $(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/libiomp5.d # we need to use -Wl,-rpath to add iomp lib directory to the runtime library search path # add iomp runtime search path to linker flags LDFLAGS += -Wl,-rpath,$(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/ -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location, but this is where -# brew should put things... +# this is where brew should put things... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # should work, or edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_gcc-10 b/make.inc.macosx_gcc-10 index 6329483ca..10edcc478 100644 --- a/make.inc.macosx_gcc-10 +++ b/make.inc.macosx_gcc-10 @@ -32,11 +32,8 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with GCC on OSX needs following... OMPFLAGS = -fopenmp OMPLIBS = -L/usr/local/lib -lgomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # edit for your MATLAB version location... MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.macosx_gcc-8 b/make.inc.macosx_gcc-8 index 464c524c8..ac4a8b3ba 100644 --- a/make.inc.macosx_gcc-8 +++ b/make.inc.macosx_gcc-8 @@ -33,11 +33,8 @@ LIBS += -L/usr/local/lib -L/opt/homebrew/lib # OpenMP with GCC on OSX needs following... OMPFLAGS = -fopenmp OMPLIBS = -L/usr/local/lib -lgomp -# since fftw3_omp doesn't work in OSX, we need... -FFTWOMPSUFFIX=threads # MATLAB interface: -# some of these will depend on your FFTW library location... MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm # edit for your MATLAB version location... 
MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex diff --git a/make.inc.windows_mingw b/make.inc.windows_mingw index e37a1436b..f712bf5fc 100644 --- a/make.inc.windows_mingw +++ b/make.inc.windows_mingw @@ -1,22 +1,9 @@ MINGW=ON # libm not available on Windows? Has to be removed from LIBS to build MATLAB mex file. Does not interfere with library build LIBS= -# please set these paths -FFTW_H_DIR= -FFTW_LIB_DIR= # might be needed for MATLAB LGOMP_DIR= -# modify FLAGS such that FFTW headers are included -ifneq ($(FFTW_H_DIR),) -CFLAGS+=-I$(FFTW_H_DIR) -CXXFLAGS+=-I$(FFTW_H_DIR) -endif -# add FFTW DLL location to LIBS -ifneq ($(FFTW_LIB_DIR),) -LIBS+=-L$(FFTW_LIB_DIR) -endif - # adjust MATLAB flags, add path of lgomp ifneq ($(FFTW_H_DIR),) MFLAGS=-I$(FFTW_H_DIR) -largeArrayDims diff --git a/makefile b/makefile index 96a7a596a..31256b911 100644 --- a/makefile +++ b/makefile @@ -18,7 +18,7 @@ CXX = g++ CC = gcc FC = gfortran -CLINK = -lstdc++ +CLINK = -lstdc++ -lm FLINK = $(CLINK) # Python version: we use python3 by default, but you may need to change... PYTHON = python3 @@ -29,11 +29,7 @@ PYTHON = python3 CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range $(CFLAGS) FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) -# FFTW base name, and math linking... -FFTWNAME = fftw3 -# linux default is fftw3_omp, since 10% faster than fftw3_threads... -FFTWOMPSUFFIX = omp -LIBS := -lm +LIBS := # multithreading for GCC: C++/C/Fortran, MATLAB, and octave (ICC differs)... OMPFLAGS = -fopenmp OMPLIBS = -lgomp @@ -59,15 +55,10 @@ FINUFFT = $(dir $(realpath $(firstword $(MAKEFILE_LIST)))) # Now come flags that should be added, whatever user overrode in make.inc. # -fPIC (position-indep code) needed to build dyn lib (.so) # Also, we force return (via :=) to the land of simply-expanded variables... -INCL = -Iinclude -CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++14 +INCL = -Iinclude -Icontrib +CXXFLAGS := $(CXXFLAGS) $(INCL) -fPIC -std=c++17 CFLAGS := $(CFLAGS) $(INCL) -fPIC -# here /usr/include needed for fftw3.f "fortran header"... (JiriK: no longer) -FFLAGS := $(FFLAGS) $(INCL) -I/usr/include -fPIC - -# single-thread total list of math and FFTW libs (now both precisions)... -# (Note: finufft tests use LIBSFFT; spread & util tests only need LIBS) -LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)f $(LIBS) +FFLAGS := $(FFLAGS) $(INCL) -fPIC # multi-threaded libs & flags, and req'd flags (OO for new interface)... ifneq ($(OMP),OFF) @@ -77,8 +68,6 @@ ifneq ($(OMP),OFF) MFLAGS += $(MOMPFLAGS) -DR2008OO OFLAGS += $(OOMPFLAGS) -DR2008OO LIBS += $(OMPLIBS) -# omp override for total list of math and FFTW libs (now both precisions)... - LIBSFFT := -l$(FFTWNAME) -l$(FFTWNAME)_$(FFTWOMPSUFFIX) -l$(FFTWNAME)f -l$(FFTWNAME)f_$(FFTWOMPSUFFIX) $(LIBS) endif # name & location of library we're building... @@ -108,7 +97,7 @@ OBJS = $(SOBJS) src/finufft.o src/simpleinterfaces.o fortran/finufftfort.o # their single-prec versions OBJSF = $(OBJS:%.o=%_32.o) # precision-dependent library object files (compiled & linked only once)... 
-OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o +OBJS_PI = $(SOBJS_PI) contrib/legendre_rule_fast.o contrib/ducc0/infra/string_utils.o contrib/ducc0/infra/threading.o # all lib dual-precision objs OBJSD = $(OBJS) $(OBJSF) $(OBJS_PI) @@ -129,7 +118,7 @@ usage: @echo " make octave - compile and test octave interfaces" @echo " make python - compile and test python interfaces" @echo " make all - do all the above (around 1 minute; assumes you have MATLAB, etc)" - @echo " make spreadtest - compile & run spreader-only tests (no FFTW)" + @echo " make spreadtest - compile & run spreader-only tests (no FFT)" @echo " make spreadtestall - small set spreader-only tests for CI use" @echo " make objclean - remove all object files, preserving libs & MEX" @echo " make clean - also remove all lib, MEX, py, and demo executables" @@ -175,14 +164,14 @@ endif $(DYNLIB): $(OBJSD) # using *absolute* path in the -o here is needed to make portable executables # when compiled against it, in mac OSX, strangely... - $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBSFFT) + $(CXX) -shared ${LDFLAGS} $(OMPFLAGS) $(OBJSD) -o $(ABSDYNLIB) $(LIBS) ifeq ($(OMP),OFF) @echo "$(DYNLIB) built, single-thread version" else @echo "$(DYNLIB) built, multithreaded version" endif -# here $(OMPFLAGS) and $(LIBSFFT) is even needed for linking under mac osx. +# here $(OMPFLAGS) is even needed for linking under mac osx. # see: http://www.cprogramming.com/tutorial/shared-libraries-linux-gcc.html # Also note -l libs come after objects, as per modern GCC requirement. @@ -210,19 +199,19 @@ endif examples/%: examples/%.o $(DYNLIB) $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) -o $@ examples/%c: examples/%c.o $(DYNLIB) - $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) $(CLINK) -o $@ + $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(CLINK) -o $@ examples/%cf: examples/%cf.o $(DYNLIB) - $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) $(CLINK) -o $@ + $(CC) $(CFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(CLINK) -o $@ # test (library validation) -------------------------------------------------- # build (skipping .o) but don't run. Run with 'test' target # Note: both precisions use same sources; single-prec executables get f suffix. -# generic tests link against our .so... (other libs needed for fftw_forget...) +# generic tests link against our .so... test/%: test/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBS) -o $@ test/%f: test/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBS) -o $@ # low-level tests that are cleaner if depend on only specific objects... test/testutils: test/testutils.cpp src/utils.o src/utils_precindep.o $(CXX) $(CXXFLAGS) ${LDFLAGS} test/testutils.cpp src/utils.o src/utils_precindep.o $(LIBS) -o test/testutils @@ -263,9 +252,9 @@ endif # perftest (performance/developer tests) ------------------------------------- # generic perf test rules... 
perftest/%: perftest/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(ABSDYNLIB) $(LIBS) -o $@ perftest/%f: perftest/%.cpp $(DYNLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} -DSINGLE $< $(ABSDYNLIB) $(LIBS) -o $@ # spreader only test, double/single (good for self-contained work on spreader) ST=perftest/spreadtestnd @@ -310,7 +299,7 @@ gurutime: $(GTT) $(GTTF) # This was for a CCQ application... (zgemm was 10x faster! double-prec only) perftest/manysmallprobs: perftest/manysmallprobs.cpp $(STATICLIB) - $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(STATICLIB) $(LIBSFFT) -o $@ + $(CXX) $(CXXFLAGS) ${LDFLAGS} $< $(STATICLIB) -o $@ @echo "manysmallprobs: single-thread..." OMP_NUM_THREADS=1 $@ @@ -349,11 +338,11 @@ fortran: $(FE) # matlab ---------------------------------------------------------------------- # matlab .mex* executable... (matlab is so slow to start, not worth testing it) matlab: matlab/finufft.cpp $(STATICLIB) - $(MEX) $< $(STATICLIB) $(INCL) $(MFLAGS) $(LIBSFFT) -output matlab/finufft + $(MEX) $< $(STATICLIB) $(INCL) $(MFLAGS) -output matlab/finufft # octave .mex executable... octave: matlab/finufft.cpp $(STATICLIB) - (cd matlab; $(MKOCTFILE) --mex finufft.cpp -I../include ../$(STATICLIB) $(OFLAGS) $(LIBSFFT) -output finufft) + (cd matlab; $(MKOCTFILE) --mex finufft.cpp -I../include ../$(STATICLIB) $(OFLAGS) -output finufft) @echo "Running octave interface tests; please wait a few seconds..." (cd matlab ;\ $(OCTAVE) test/check_finufft.m ;\ @@ -374,7 +363,7 @@ endif # python --------------------------------------------------------------------- python: $(STATICLIB) $(DYNLIB) - FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install -e ./python/finufft + FINUFFT_DIR=$(FINUFFT) $(PYTHON) -m pip -v install --break-system-packages -e ./python/finufft # note to devs: if trouble w/ NumPy, use: pip install ./python --no-deps $(PYTHON) python/finufft/test/run_accuracy_tests.py $(PYTHON) python/finufft/examples/simple1d1.py @@ -435,7 +424,7 @@ endif objclean: ifneq ($(MINGW),ON) # non-Windows-WSL... - rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o + rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o contrib/ducc0/infra/*.o rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o finufft_mod.mod else # Windows-WSL... diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp index 9524b4dda..f7a271c6a 100644 --- a/perftest/guru_timing_test.cpp +++ b/perftest/guru_timing_test.cpp @@ -134,9 +134,6 @@ int main(int argc, char* argv[]) } // Andrea found the following are needed to get reliable independent timings: - FFTW_CLEANUP(); - FFTW_CLEANUP_THREADS(); - FFTW_FORGET_WISDOM(); //std::this_thread::sleep_for(std::chrono::seconds(1)); sleep(tsleep); @@ -187,14 +184,6 @@ int main(int argc, char* argv[]) // Comparing timing results with repeated calls to corresponding finufft function... - // The following would normally be done between independent timings, as found - // by Andrea Malleo, but in this case we need to access the plan later - // for many_simple_calls() to work, so we cannot do FFTW cleanup without - // apparently causing segfault :(. So we skip them. 
- //FFTW_CLEANUP(); - //FFTW_CLEANUP_THREADS(); - //FFTW_FORGET_WISDOM(); - //std::this_thread::sleep_for(std::chrono::seconds(1)); if c++11 is allowed sleep(tsleep); //sleep for one second using linux sleep call diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index c6776cf0e..1cf578cf8 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -16,7 +16,7 @@ int main(int argc, char* argv[]) for Xi Chen question. Updated to also demo guru interface and compare speed. 6/7/22 made deterministic changes so check answer matches both ways. - g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs -lfftw3 -lfftw3_omp -lm + g++ -fopenmp manysmallprobs.cpp ../lib-static/libfinufft.a -o manysmallprobs # multithreaded is much slower, due to overhead of starting threads?... export OMP_NUM_THREADS=1 time ./manysmallprobs diff --git a/perftest/timingBreakdowns.py b/perftest/timingBreakdowns.py index 3abf92e17..33f6fe12c 100644 --- a/perftest/timingBreakdowns.py +++ b/perftest/timingBreakdowns.py @@ -123,7 +123,7 @@ totalTimeT3_Old.append(totalOldTime) #total time speedup - totalSpeedup = round(totalOldTime/totalNewTime,5) + totalSpeedup = round(totalOldTime/max(1e-19,totalNewTime),5) if(ftype == 1): totalTimeT1Ratio.append(totalSpeedup) @@ -201,7 +201,7 @@ totalOldfftwPlan = round(totalOldfftwPlan,5) #These plan ratios include the initial old implementation plan construction!! - fftwPlanRatio = round(totalOldfftwPlan/new_fftwPlan,5) + fftwPlanRatio = round(totalOldfftwPlan/max(1e-19,new_fftwPlan),5) if(ftype == 1): fftwPlanT1Ratio.append(fftwPlanRatio) @@ -228,7 +228,7 @@ #collect the fftw_exec timings for each trial of old totalOldfft = stm.sumAllTime("(.*fft \(\d+ threads\))(.*)",strOut) - fftRatio = round(totalOldfft/new_fft,5) + fftRatio = round(totalOldfft/max(1e-19,new_fft),5) if(ftype == 1): fftT1Ratio.append(fftRatio) @@ -302,11 +302,11 @@ ax1 = fig.add_subplot(221,projection='3d') if(totalTimeT1Ratio): - ax1.bar3d(t1x, t1y, zbot, widths, depths, totalTimeT1Ratio, shade=True, color='r', label='type1', alpha='1') + ax1.bar3d(t1x, t1y, zbot, widths, depths, totalTimeT1Ratio, shade=True, color='r', label='type1', alpha=1) if(totalTimeT2Ratio): - ax1.bar3d(t2x, t2y, zbot, widths, depths, totalTimeT2Ratio, shade=True, color='b', label='type2', alpha='1') + ax1.bar3d(t2x, t2y, zbot, widths, depths, totalTimeT2Ratio, shade=True, color='b', label='type2', alpha=1) if(totalTimeT3Ratio): - ax1.bar3d(t3x, t3y, zbot, widths, depths, totalTimeT3Ratio, shade=True, color='g', label='type3', alpha='1') + ax1.bar3d(t3x, t3y, zbot, widths, depths, totalTimeT3Ratio, shade=True, color='g', label='type3', alpha=1) ax1.legend([t1_proxy,t2_proxy,t3_proxy], ['type1','type2','type3']) diff --git a/src/finufft.cpp b/src/finufft.cpp index 696500b76..57d864b98 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include @@ -17,6 +16,8 @@ #include #include #include "../contrib/legendre_rule_fast.h" +#include "ducc0/fft/fft.h" +#include "ducc0/fft/fftnd_impl.h" using namespace std; using namespace finufft; @@ -45,8 +46,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: 3) deconvolve by division of each Fourier mode independently by the kernel Fourier series coeffs (not merely FFT of kernel), shuffle to output. The kernel coeffs are precomputed in what is called step 0 in the code. - Written with FFTW style complex arrays. 
Step 3a internally uses CPX, - and Step 3b internally uses real arithmetic and FFTW style complex. TYPE 2: The type 2 algorithm proceeds in three main steps: @@ -54,8 +53,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: 2) compute inverse FFT on uniform fine grid 3) spread (dir=2, ie interpolate) data to regular mesh The kernel coeffs are precomputed in what is called step 0 in the code. - Written with FFTW style complex arrays. Step 0 internally uses CPX, - and Step 1 internally uses real arithmetic and FFTW style complex. TYPE 3: The type 3 algorithm is basically a type 2 (which is implemented precisely @@ -69,7 +66,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: using quadrature of the kernel function times exponentials. iii) Shifts in x (real) and s (Fourier) are done to minimize the interval half-widths X and S, hence nf1. - No references to FFTW are needed here. CPX arithmetic is used. MULTIPLE STRENGTH VECTORS FOR THE SAME NONUNIFORM POINTS (n_transf>1): maxBatchSize (set to max_num_omp_threads) times the RAM is needed, so @@ -83,7 +79,7 @@ Design notes for guru interface implementation: since that would only survive in the scope of each function. * Thread-safety: FINUFFT plans are passed as pointers, so it has no global - state apart from that associated with FFTW (and the did_fftw_init). + state, */ @@ -208,7 +204,7 @@ void onedim_fseries_kernel(BIGINT nf, FLT *fwkerhalf, finufft_spread_opts opts) FLT f[MAX_NQUAD]; double z[2*MAX_NQUAD], w[2*MAX_NQUAD]; legendre_compute_glr(2*q,z,w); // only half the nodes used, eg on (0,1) - std::complex a[MAX_NQUAD]; + CPX a[MAX_NQUAD]; for (int n=0;n aj[MAX_NQUAD]; // phase rotator for this thread + CPX aj[MAX_NQUAD]; // phase rotator for this thread for (int n=0;nfwBatch, using the same set of + to (or from) the batch of fine working grids fwBatch, using the same set of (index-sorted) NU points p->X,Y,Z for each vector in the batch. The direction (spread vs interpolate) is set by p->spopts.spread_direction. Returns 0 (no error reporting for now). @@ -435,7 +428,7 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) #endif #pragma omp parallel for num_threads(nthr_outer) for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace + CPX *fwi = fwBatch + i*p->nf; // start of i'th fw array in wkspace CPX *ci = cBatch + i*p->nj; // start of i'th c array in cBatch spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (FLT*)fwi, p->nj, p->X, p->Y, p->Z, (FLT*)ci, p->spopts, p->didSort); @@ -443,12 +436,12 @@ int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN p, CPX* cBatch) return 0; } -int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) +int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX * fwBatch, CPX* fkBatch) /* - Type 1: deconvolves (amplifies) from each interior fw array in p->fwBatch + Type 1: deconvolves (amplifies) from each interior fw array in fwBatch into each output array fk in fkBatch. Type 2: deconvolves from user-supplied input fk to 0-padded interior fw, - again looping over fk in fkBatch and fw in p->fwBatch. + again looping over fk in fkBatch and fw in fwBatch. The direction (spread vs interpolate) is set by p->spopts.spread_direction. This is mostly a loop calling deconvolveshuffle?d for the needed dim batchSize times. @@ -458,7 +451,7 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) // since deconvolveshuffle?d are single-thread, omp par seems to help here... 
#pragma omp parallel for num_threads(batchSize) for (int i=0; ifwBatch + i*p->nf; // start of i'th fw array in wkspace + CPX *fwi = fwBatch + i*p->nf; // start of i'th fw array in wkspace CPX *fki = fkBatch + i*p->N; // start of i'th fk array in fkBatch // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... @@ -482,14 +475,14 @@ int deconvolveBatch(int batchSize, FINUFFT_PLAN p, CPX* fkBatch) // since this func is local only, we macro its name here... #ifdef SINGLE -#define GRIDSIZE_FOR_FFTW gridsize_for_fftwf +#define GRIDSIZE_FOR_FFT gridsize_for_fftf #else -#define GRIDSIZE_FOR_FFTW gridsize_for_fftw +#define GRIDSIZE_FOR_FFT gridsize_for_fft #endif -int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ +int* GRIDSIZE_FOR_FFT(FINUFFT_PLAN p){ // local helper func returns a new int array of length dim, extracted from -// the finufft plan, that fftw_plan_many_dft needs as its 2nd argument. +// the finufft plan, that is needed for calling FFTs. int* nf; if(p->dim == 1){ nf = new int[1]; @@ -499,7 +492,7 @@ int* GRIDSIZE_FOR_FFTW(FINUFFT_PLAN p){ nf = new int[2]; nf[0] = (int)p->nf2; nf[1] = (int)p->nf1; - } // fftw enforced row major ordering, ie dims are backwards ordered + } // use row major ordering, ie dims are backwards ordered else{ nf = new int[3]; nf[0] = (int)p->nf3; @@ -537,7 +530,7 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) o->showwarn = 1; o->nthreads = 0; - o->fftw = FFTW_ESTIMATE; + o->fftw = 0; // FIXME: unused o->spread_sort = 2; o->spread_kerevalmeth = 1; o->spread_kerpad = 1; @@ -555,8 +548,8 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // Populates the fields of finufft_plan which is pointed to by "p". // opts is ptr to a finufft_opts to set options, or NULL to use defaults. // For some of the fields, if "auto" selected, choose the actual setting. -// For types 1,2 allocates memory for internal working arrays, -// evaluates spreading kernel coefficients, and instantiates the fftw_plan +// For types 1,2 allocates memory for internal working arrays, and +// evaluates spreading kernel coefficients { FINUFFT_PLAN p; cout << scientific << setprecision(15); // for commented-out low-lev debug @@ -646,20 +639,6 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, // ------------------------ types 1,2: planning needed --------------------- if (type==1 || type==2) { - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) - // Note: batchSize not used since might be only 1. - // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT - // thread-safe (can be called inside OMP) - { - static bool did_fftw_init = false; // the only global state of FINUFFT - std::lock_guard lock(fftw_lock); - if (!did_fftw_init) { - FFTW_INIT(); // setup FFTW global state; should only do once - FFTW_PLAN_TH(nthr_fft); // ditto - did_fftw_init = true; // ensure other FINUFFT threads don't clash - } - } - p->spopts.spread_direction = type; if (p->opts.showwarn) { // user warn round-off error... 
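The initialization block deleted in the hunk above was the library's only global state; with it gone, plan creation needs no lock. An illustrative sketch of the now-safe pattern (not from the patch; examples/threadsafe1d1.cpp is the shipped demo):

    #include <complex>
    #include <cstdint>
    #include <thread>
    #include "finufft.h"

    // each call plans, executes, and destroys its own transform; with the
    // FFTW planner lock gone this needs no external synchronization
    void one_1d1(int64_t M, int64_t N, double* x,
                 std::complex<double>* c, std::complex<double>* F) {
      finufft_opts opts;
      finufft_default_opts(&opts);
      opts.nthreads = 1;                            // single-threaded worker
      finufft1d1(M, x, c, +1, 1e-9, N, F, &opts);   // type 1, tolerance 1e-9
    }
    // ...launch several one_1d1() calls on std::thread objects and join them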
@@ -704,44 +683,20 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, if (dim>2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); if (p->opts.debug) printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n",__func__,p->spopts.nspread, timer.elapsedsec()); - timer.restart(); p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); return FINUFFT_ERR_MAXNALLOC; } - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__); - free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); - return FINUFFT_ERR_ALLOC; - } - - timer.restart(); // plan the FFTW - int *ns = GRIDSIZE_FOR_FFTW(p); - // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags - { - std::lock_guard lock(fftw_lock); - p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, - p->fftSign, p->opts.fftw); - } - if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec()); - delete []ns; - } else { // -------------------------- type 3 (no planning) ------------ if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n",__func__,dim,type,ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->CpBatch = NULL; - p->fwBatch = NULL; p->Sp = NULL; p->Tp = NULL; p->Up = NULL; p->prephase = NULL; p->deconv = NULL; p->innerT2plan = NULL; - // Type 3 will call finufft_makeplan for type 2; no need to init FFTW - // Note we don't even know nj or nk yet, so can't do anything else! } return ier; // report setup_spreader status (could be warning) } @@ -826,19 +781,6 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); return FINUFFT_ERR_MAXNALLOC; } - if (p->fwBatch) - FFTW_FR(p->fwBatch); - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace - - // (note FFTW_ALLOC is not needed over malloc, but matches its type) - if(p->CpBatch) free(p->CpBatch); - p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize); // batch c' work - if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, timer.elapsedsec()); - if(!p->fwBatch || !p->CpBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__); - return FINUFFT_ERR_ALLOC; - } - //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... if(p->X) free(p->X); @@ -988,13 +930,14 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){ existing (sorted) NU pts and existing plan. For type 1 and 3: cj is input, fk is output. For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate + Performs spread/interp, pre/post deconvolve, and FFTs as appropriate for each of the 3 types. For cases of ntrans>1, performs work in blocks of size up to batchSize. 
@@ -988,13 +930,14 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
    existing (sorted) NU pts and existing plan.
    For type 1 and 3: cj is input, fk is output. For type 2: fk is input, cj is output.
-   Performs spread/interp, pre/post deconvolve, and fftw_execute as appropriate
+   Performs spread/interp, pre/post deconvolve, and FFTs as appropriate
    for each of the 3 types.
    For cases of ntrans>1, performs work in blocks of size up to batchSize.
    Return value 0 (no error diagnosis yet).
    Barnett 5/20/20, based on Malleo 2019.
 */
   CNTime timer; timer.start();
+  std::vector<CPX> fwBatch(p->nf * p->batchSize);  // the big workspace
 
   if (p->type!=3){ // --------------------- TYPE 1,2 EXEC ------------------
 
@@ -1014,27 +957,110 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
       // STEP 1: (varies by type)
       timer.restart();
       if (p->type == 1) {  // type 1: spread NU pts p->X, weights cj, to fw grid
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
       } else {          //  type 2: amplify Fourier coeffs fk into 0-padded fw
-        deconvolveBatch(thisBatchSize, p, fkb);
+        deconvolveBatch(thisBatchSize, p, fwBatch.data(), fkb);
         t_deconv += timer.elapsedsec();
       }
 
-      // STEP 2: call the pre-planned FFT on this batch
+      // STEP 2: call the FFT on this batch
       timer.restart();
-      FFTW_EX(p->fftwPlan);   // if thisBatchSize<batchSize it wastes some flops
+      {
+        int *ns = GRIDSIZE_FOR_FFTW(p);
+        std::vector<size_t> arrdims, axes;
+        arrdims.push_back(size_t(p->batchSize));
+        arrdims.push_back(size_t(ns[0])); axes.push_back(1);
+        if (p->dim>=2) { arrdims.push_back(size_t(ns[1])); axes.push_back(2); }
+        if (p->dim>=3) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); }
+        ducc0::vfmav<CPX> data(fwBatch.data(), arrdims);
+        if (p->dim==1)      // 1D: no chance for FFT shortcuts
+          ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+        else if (p->dim==2) // 2D: do partial FFTs
+        {
+          if (p->ms<2)      // something is weird, do standard FFT
+            ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+          else
+          {
+            size_t y_lo = size_t((p->ms+1)/2);
+            size_t y_hi = size_t(ns[1]-p->ms/2);
+            auto sub1 = ducc0::subarray(data, {{},{},{0,y_lo}});
+            auto sub2 = ducc0::subarray(data, {{},{},{y_hi,ducc0::MAXIDX}});
+            if (p->type == 1) // spreading, not all parts of the output array are needed
+            {
+              // do axis 2 in full
+              ducc0::c2c(data, data, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 1
+              ducc0::c2c(sub1, sub1, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+            else              // interpolation, parts of the input array are zero
+            {
+              // do only parts of axis 1
+              ducc0::c2c(sub1, sub1, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do axis 2 in full
+              ducc0::c2c(data, data, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+          }
+        }
+        else                // 3D
+        {
+          if ((p->ms<2) || (p->mt<2)) // something is weird, do standard FFT
+            ducc0::c2c(data, data, axes, p->fftSign<0, FLT(1), p->opts.nthreads);
+          else
+          {
+            size_t z_lo = size_t((p->ms+1)/2);
+            size_t z_hi = size_t(ns[2]-p->ms/2);
+            size_t y_lo = size_t((p->mt+1)/2);
+            size_t y_hi = size_t(ns[1]-p->mt/2);
+            auto sub1 = ducc0::subarray(data, {{},{},{},{0,z_lo}});
+            auto sub2 = ducc0::subarray(data, {{},{},{},{z_hi,ducc0::MAXIDX}});
+            auto sub3 = ducc0::subarray(sub1, {{},{},{0,y_lo},{}});
+            auto sub4 = ducc0::subarray(sub1, {{},{},{y_hi, ducc0::MAXIDX},{}});
+            auto sub5 = ducc0::subarray(sub2, {{},{},{0,y_lo},{}});
+            auto sub6 = ducc0::subarray(sub2, {{},{},{y_hi, ducc0::MAXIDX},{}});
+            if (p->type == 1) // spreading, not all parts of the output array are needed
+            {
+              // do axis 3 in full
+              ducc0::c2c(data, data, {3}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 2
+              ducc0::c2c(sub1, sub1, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do even smaller parts of axis 1
+              ducc0::c2c(sub3, sub3, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub4, sub4, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub5, sub5, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub6, sub6, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+            else              // interpolation, parts of the input array are zero
+            {
+              // do even smaller parts of axis 1
+              ducc0::c2c(sub3, sub3, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub4, sub4, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub5, sub5, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub6, sub6, {1}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do only parts of axis 2
+              ducc0::c2c(sub1, sub1, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              ducc0::c2c(sub2, sub2, {2}, p->fftSign<0, FLT(1), p->opts.nthreads);
+              // do axis 3 in full
+              ducc0::c2c(data, data, {3}, p->fftSign<0, FLT(1), p->opts.nthreads);
+            }
+          }
+        }
+        delete[] ns;
+      }
       t_fft += timer.elapsedsec();
       if (p->opts.debug>1)
-        printf("\tFFTW exec:\t\t%.3g s\n", timer.elapsedsec());
+        printf("\tFFT exec:\t\t%.3g s\n", timer.elapsedsec());
 
       // STEP 3: (varies by type)
       timer.restart();
       if (p->type == 1) {  // type 1: deconvolve (amplify) fw and shuffle to fk
-        deconvolveBatch(thisBatchSize, p, fkb);
+        deconvolveBatch(thisBatchSize, p, fwBatch.data(), fkb);
         t_deconv += timer.elapsedsec();
       } else {          // type 2: interpolate unif fw grid to NU target pts
-        spreadinterpSortedBatch(thisBatchSize, p, cjb);
+        spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), cjb);
         t_sprint += timer.elapsedsec();
       }
     }                                                  // ........end b loop
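The partial transforms above lean on FFT mode ordering: of the nf fine-grid points along an axis holding ms retained modes, deconvolution reads only those ms, which sit at the first (ms+1)/2 and last ms/2 indices. For type 1 the contiguous axis is therefore transformed in full and the outer axes only over those kept ranges; for type 2 the order flips, because on input only the kept ranges are nonzero. A self-contained 2D sketch of the same trick using documented ducc0 calls (plain n1 x n2 grid; the function and its arguments are mine, not the library's):

    #include <complex>
    #include <vector>
    #include "ducc0/fft/fft.h"

    using cplx = std::complex<double>;

    // Forward-FFT an n1 x n2 grid in place, skipping axis-0 work on columns
    // that a type-1 deconvolve never reads; assumes ms < n2 (upsampled grid).
    void fft2_type1_style(std::vector<cplx>& grid, size_t n1, size_t n2,
                          size_t ms, size_t nthreads) {
      ducc0::vfmav<cplx> data(grid.data(), {n1, n2});
      ducc0::c2c(data, data, {1}, true, 1.0, nthreads);  // contiguous axis in full
      auto lo = ducc0::subarray(data, {{}, {0, (ms+1)/2}});             // kept >=0 freqs
      auto hi = ducc0::subarray(data, {{}, {n2-ms/2, ducc0::MAXIDX}});  // kept <0 freqs
      ducc0::c2c(lo, lo, {0}, true, 1.0, nthreads);      // outer axis, kept columns only
      ducc0::c2c(hi, hi, {0}, true, 1.0, nthreads);
    }

With the usual upsampling (nf about 2*ms) this skips roughly half of the outer-axis transforms; the 3D branch above nests the same idea once more via sub3..sub6.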
@@ -1060,6 +1086,8 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
     if (p->opts.debug)
       printf("[%s t3] start ntrans=%d (%d batches, bsize=%d)...\n",__func__,p->ntrans, p->nbatch, p->batchSize);
 
+    std::vector<CPX> CpBatch(p->nj*p->batchSize);  // batch c' work
+
     for (int b=0; b*p->batchSize < p->ntrans; b++) { // .....loop b over batches
 
       // batching and pointers to this batch, identical to t1,2 above...
@@ -1075,25 +1103,25 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
       for (int i=0; i<thisBatchSize; i++) {
         BIGINT ioff = i*p->nj;
         for (BIGINT j=0;j<p->nj;++j)
-          p->CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j];
+          CpBatch[ioff+j] = p->prephase[j] * cjb[ioff+j];
       }
       t_pre += timer.elapsedsec();
 
       // STEP 1: spread c'_j batch (x'_j NU pts) into fw batch grid...
       timer.restart();
       p->spopts.spread_direction = 1;                         // spread
-      spreadinterpSortedBatch(thisBatchSize, p, p->CpBatch);  // p->X are primed
+      spreadinterpSortedBatch(thisBatchSize, p, fwBatch.data(), CpBatch.data());  // p->X are primed
       t_spr += timer.elapsedsec();
-      //for (int j=0;j<p->nf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,p->fwBatch[j][0],p->fwBatch[j][1]);  // debug
+      //for (int j=0;j<p->nf1;++j) printf("fw[%d]=%.3g+%.3gi\n",j,fwBatch[j].real(),fwBatch[j].imag());  // debug
 
       // STEP 2: type 2 NUFFT from fw batch to user output fk array batch...
       timer.restart();
       // illegal possible shrink of ntrans *after* plan for smaller last batch:
       p->innerT2plan->ntrans = thisBatchSize;   // do not try this at home!
-      /* (alarming that FFTW not shrunk, but safe, because t2's fwBatch array
+      /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array
          still the same size, as Andrea explained; just wastes a few flops) */
-      FINUFFT_EXECUTE(p->innerT2plan, fkb, (CPX*)(p->fwBatch));
+      FINUFFT_EXECUTE(p->innerT2plan, fkb, fwBatch.data());
       t_t2 += timer.elapsedsec();
 
       // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)...
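Worth noting about the two hunks above: fwBatch and CpBatch are no longer plan members but locals of FINUFFT_EXECUTE, sized per call and released by the vector destructor on every exit path, so the destroy code below loses the matching frees. A stripped-down sketch of the pattern (names hypothetical; the real batch loop is above):

    #include <complex>
    #include <vector>

    // Per-call scratch: the big workspace no longer lives in the plan, which
    // removes one class of shared-state writes from execute() (type 3 still
    // pokes p->spopts.spread_direction and innerT2plan->ntrans, as seen above).
    template<typename FLT>
    int execute_batches(size_t nf, size_t batchSize, size_t nbatch) {
      std::vector<std::complex<FLT>> fwBatch(nf * batchSize);  // value-initialized
      for (size_t b = 0; b < nbatch; ++b) {
        // ... spread / FFT / deconvolve one batch through fwBatch.data() ...
      }
      return 0;            // fwBatch freed here and on any early return
    }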
@@ -1115,7 +1143,7 @@ int FINUFFT_EXECUTE(FINUFFT_PLAN p, CPX* cj, CPX* fk){
     }
   }
   //for (BIGINT k=0;k<10;++k) printf("\tfk[%ld]=%.15g+%.15gi\n",(long int)k,(double)real(fk[k]),(double)imag(fk[k]));  // debug
-  
+
   return 0;
 }
 
@@ -1130,19 +1158,13 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p)
   if (!p)                // NULL ptr, so not a ptr to a plan, report error
     return 1;
 
-  FFTW_FR(p->fwBatch);   // free the big FFTW (or t3 spread) working array
   free(p->sortIndices);
   if (p->type==1 || p->type==2) {
-    {
-      std::lock_guard<std::mutex> lock(fftw_lock);
-      FFTW_DE(p->fftwPlan);
-    }
     free(p->phiHat1);
     free(p->phiHat2);
     free(p->phiHat3);
   } else {               // free the stuff alloc for type 3 only
     FINUFFT_DESTROY(p->innerT2plan);   // if NULL, ignore its error code
-    free(p->CpBatch);
     free(p->Sp); free(p->Tp); free(p->Up);
     free(p->X); free(p->Y); free(p->Z);
     free(p->prephase);
diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp
index ba3a3f328..830fa9011 100644
--- a/test/dumbinputs.cpp
+++ b/test/dumbinputs.cpp
@@ -23,11 +23,11 @@
    Made pass-fail, obviating results/dumbinputs.refout. Barnett 6/16/23.
 
    Suggested compile:
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lfftw3_omp -lm
-   g++ -std=c++14 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -lfftw3 -lfftw3_omp -lm -DSINGLE
+   g++ -std=c++17 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs
+   g++ -std=c++17 -fopenmp dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputsf -DSINGLE
 
    or if you have built a single-core version:
-   g++ -std=c++14 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs -lfftw3 -lm
+   g++ -std=c++17 dumbinputs.cpp -I../include ../lib/libfinufft.so -o dumbinputs
 
    etc
 */
diff --git a/test/finufft1d_test.cpp b/test/finufft1d_test.cpp
index 8dd345b1a..90a0aaaba 100644
--- a/test/finufft1d_test.cpp
+++ b/test/finufft1d_test.cpp
@@ -19,7 +19,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);  // put defaults in opts
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<3 || argc>8) {
     for (int i=0; help[i]; ++i)
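The test deletions that follow are two recurring edits per file: the opts.fftw field is gone, and FFTW_FORGET_WISDOM() no longer exists. The latter kept the many-vs-single timing comparisons fair: FFTW accumulates planner wisdom globally, so without a reset the repeated-single reference runs would reuse plans measured during the "many" run. A sketch of what such a reset amounts to in plain FFTW (clearing both precisions together is this sketch's assumption):

    #include <fftw3.h>

    // Drop all accumulated planner wisdom so the next fftw_plan_* call
    // cannot reuse measurements from earlier plans of the same shape.
    void forget_all_wisdom() {
      fftw_forget_wisdom();    // double-precision planner state
      fftwf_forget_wisdom();   // single-precision planner state
    }

ducc0 exposes no planner cache to the caller, so there is nothing left for the tests to reset.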
diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp
index 581c52c2d..b2cff678a 100644
--- a/test/finufft1dmany_test.cpp
+++ b/test/finufft1dmany_test.cpp
@@ -20,7 +20,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<4 || argc>11) {
     for (int i=0; help[i]; ++i)
@@ -81,7 +80,6 @@ int main(int argc, char* argv[])
            (long long)nt1,i,err);
 
   // compare the result with FINUFFT1D1
-  FFTW_FORGET_WISDOM();
   CPX * F_1d1 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
   CPX * Fstart;
   CPX * cstart;
@@ -112,7 +110,6 @@ int main(int argc, char* argv[])
 
   printf("test 1d2 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
 
   #pragma omp parallel
   {
@@ -141,7 +138,6 @@ int main(int argc, char* argv[])
     printf("\tone targ: rel err in c[%lld] of trans#%d is %.3g\n",(long long)jt,i,err);
 
   // check against single calls to FINUFFT1D2...
-  FFTW_FORGET_WISDOM();
   CPX * c_1d2 = (CPX *)malloc(sizeof(CPX)*M*ntransf);
   timer.restart();
   for(BIGINT j = 0; j < ntransf; j++){
@@ -165,7 +161,6 @@ int main(int argc, char* argv[])
   free(c_1d2);
 
   printf("test 1d3 many vs repeated single: ------------------------------------\n");
-  FFTW_FORGET_WISDOM();
 
   #pragma omp parallel
   {
@@ -206,7 +201,6 @@ int main(int argc, char* argv[])
     printf("\tone targ: rel err in F[%lld] of trans#%d is %.3g\n",(long long)kt,i,err);
 
   // compare the result with single calls to FINUFFT1D3...
-  FFTW_FORGET_WISDOM();
   CPX *f_1d3 = (CPX *)malloc(sizeof(CPX)*N*ntransf);
   timer.restart();
   for(int k = 0; k < ntransf; k++){
diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp
index 04945b5f9..f2e02aaa6 100644
--- a/test/finufft2d_test.cpp
+++ b/test/finufft2d_test.cpp
@@ -19,7 +19,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  // opts.fftw = FFTW_MEASURE;  // change from usual FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<4 || argc>9) {
     for (int i=0; help[i]; ++i)
diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp
index 31b65378e..e79a8613c 100644
--- a/test/finufft2dmany_test.cpp
+++ b/test/finufft2dmany_test.cpp
@@ -20,7 +20,6 @@ int main(int argc, char* argv[])
   double w, tol = 1e-6;          // default
   double err, errfail = INFINITY, errmax = 0;
   finufft_opts opts; FINUFFT_DEFAULT_OPTS(&opts);
-  //opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<5 || argc>12) {
     for (int i=0; help[i]; ++i)
@@ -85,7 +84,6 @@ int main(int argc, char* argv[])
            (long long)nt1,(long long)nt2,i,err);
 
   // compare the result with FINUFFT2D1
-  FFTW_FORGET_WISDOM();
   finufft_opts simpleopts = opts;
   simpleopts.debug = 0;        // don't output timing for calls of FINUFFT2D1
   simpleopts.spread_debug = 0;
@@ -125,7 +123,6 @@ int main(int argc, char* argv[])
   for (BIGINT m=0; m<N; ++m)
[...]
-  FFTW_FORGET_WISDOM();
diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp
[...]
-  //opts.fftw = FFTW_MEASURE;  // change from default FFTW_ESTIMATE
   int isign = +1;                // choose which exponential sign to test
   if (argc<6 || argc>13) {
     for (int i=0; help[i]; ++i)
@@ -89,7 +88,6 @@ int main(int argc, char* argv[])
            (long long)nt1,(long long)nt2,(long long)nt3,i,err);
 
   // compare the result with FINUFFT3D1
-  FFTW_FORGET_WISDOM();
   finufft_opts simpleopts=opts;
   simpleopts.debug = 0;        // don't output timing for calls of FINUFFT3D1
   simpleopts.spread_debug = 0;
@@ -128,7 +126,6 @@ int main(int argc, char* argv[])
 #pragma omp for schedule(static,TEST_RANDCHUNK)
   for (BIGINT m=0; m