diff --git a/.github/workflows/python_build_win.ps1 b/.github/workflows/python_build_win.ps1 index fcb3d611f..072413d42 100644 --- a/.github/workflows/python_build_win.ps1 +++ b/.github/workflows/python_build_win.ps1 @@ -41,6 +41,8 @@ Copy-Item -Path C:\msys64\mingw64\bin\libgomp-*.dll -Destination ([IO.Path]::Com Copy-Item -Path C:\msys64\mingw64\bin\libwinpthread-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libfftw3-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) +Copy-Item -Path C:\msys64\mingw64\bin\libfftw3_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) +Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft')) New-Item -Path .\wheelhouse -ItemType Directory -Force wheel.exe pack $unpacked_wheel -d .\wheelhouse if (-not $?) {throw "Failed pack wheel"} diff --git a/.github/workflows/python_test_win.ps1 b/.github/workflows/python_test_win.ps1 index 3e77ef049..949a6906d 100644 --- a/.github/workflows/python_test_win.ps1 +++ b/.github/workflows/python_test_win.ps1 @@ -1,6 +1,9 @@ -python -m pip install finufft -f .\wheelhouse\ +python -m pip install --pre finufft -f .\wheelhouse\ if (-not $?) {throw "Failed to pip install finufft"} python python/finufft/test/run_accuracy_tests.py if (-not $?) {throw "Tests failed"} python python/finufft/examples/simple1d1.py if (-not $?) {throw "Simple1d1 test failed"} +python -m pip install pytest +python -m pytest python/finufft/test +if (-not $?) {throw "Pytest suite failed"} diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml index 095653cc7..fd0db91bb 100644 --- a/.github/workflows/python_wheel.yml +++ b/.github/workflows/python_wheel.yml @@ -34,7 +34,7 @@ jobs: uses: actions/upload-artifact@v2 with: name: linux-wheels - path: python/wheelhouse/finufft*manylinux*.whl + path: python/finufft/wheelhouse/finufft*manylinux*.whl MacOS: runs-on: macos-latest @@ -60,19 +60,6 @@ jobs: # hack to make libquadmath link statically sudo rm /usr/local/opt/gcc@11/lib/gcc/11/libquadmath.*dylib - - name: Cache macOS Python binaries - id: cache-python - uses: actions/cache@v3 - with: - path: | - /Library/Frameworks/Python.framework/Versions/3.6 - /Library/Frameworks/Python.framework/Versions/3.7 - /Library/Frameworks/Python.framework/Versions/3.8 - /Library/Frameworks/Python.framework/Versions/3.9 - /Library/Frameworks/Python.framework/Versions/3.10 - /Library/Frameworks/Python.framework/Versions/3.11 - key: macos-python-3.6.8-macosx10.9-python3.7.9-macosx10.9-python3.8.3-macosx10.9-python3.9.7-macos11-python3.10.1-macos11-python3.11.0-macos11 - # Download and install Python instead of using the setup_python # as the python interpreters in the Github machines # were compiled in 10.14, the wheels built with them @@ -138,30 +125,42 @@ jobs: PYTHON_BIN=/Library/Frameworks/Python.framework/Versions/3.11/bin/ $PYTHON_BIN/python3 -m pip install delocate ls wheelhouse/finufft*.whl | xargs -n1 $PYTHON_BIN/delocate-wheel -w fixed_wheel/ - /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 test/run_accuracy_tests.py 
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 examples/simple1d1.py - /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pytest test + /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 test/run_accuracy_tests.py /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 examples/simple1d1.py - /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pytest test + /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 test/run_accuracy_tests.py /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 examples/simple1d1.py - /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pytest test + /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 test/run_accuracy_tests.py /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 examples/simple1d1.py - /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pytest test + /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 test/run_accuracy_tests.py /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 examples/simple1d1.py - /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install finufft -f fixed_wheel/ + /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pytest test + /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install --pre finufft -f fixed_wheel/ /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 test/run_accuracy_tests.py /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 examples/simple1d1.py + /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install pytest + /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pytest test - name: Upload wheels uses: actions/upload-artifact@v2 with: name: macos-wheels - path: python/fixed_wheel/*.whl + path: python/finufft/fixed_wheel/*.whl Windows: runs-on: windows-latest diff --git a/.readthedocs.yml b/.readthedocs.yaml similarity index 79% rename from .readthedocs.yml rename to .readthedocs.yaml index dc748b7c0..0f63d4f34 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yaml @@ -1,12 +1,16 @@ # .readthedocs.yml # "Read the Docs" doc-hosting website configuration file # See 
https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-#
-# Barnett 10/5/20.

# Required (for this file format)
version: 2

+# Set the OS, Python version and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.11"
+
 # Build all formats
 formats: all
@@ -16,6 +20,5 @@ sphinx:

 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: 3.7
   install:
     - requirements: docs/requirements.txt
diff --git a/CHANGELOG b/CHANGELOG
index e2932f637..9aa0bf1c1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,12 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (up to v 1.3 CUFINUFFT indicated separately).

-V 2.2.0 (not yet released... planning summer 2023):
+V 2.3.0 (expected Dec 2023)
+
+* finally, thread-safety for all FFTW cases; kill FFTW_PLAN_SAFE (PR 354)
+* python interface edge cases (singleton dims, #395). Awaiting #367
+
+V 2.2.0 (08/30/23) (untagged; aka 2.2.0.dev0, a name used to bump python pkg)

 * MERGE OF CUFINUFFT (GPU CODE) INTO FINUFFT SOURCE TREE:
   - combined cmake build system via FINUFFT_USE_CUDA flag
@@ -13,7 +18,7 @@ V 2.2.0 (not yet released... planning summer 2023):
   - cufinufft repo will be obsoleted.
   - coding leads on this: Robert Blackwell and Joakim Anden
 * cmake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
-  - Note: the plan is to make the makefiles/make.inc.* obsolete by 2023 end.
+  - Note: the plan is to make the makefiles/make.inc.* obsolete.
 * interp (for type 2) accel by up to 60% in high-acc 2D/3D, by FMA/SIMD-friendly
   rearranging of loops, by Martin Reinecke, PR #292.
 * remove inv array in binsort; speeds up multithreaded case by up to 50%
@@ -25,7 +30,7 @@ V 2.2.0 (not yet released... planning summer 2023):
   remains public, as should be (PR #233). Allows plan to have C++ constructs.
 * fixed single-thread (OMP=OFF) build which failed due to fftw_defs.h/defs.h

-CUFINUFFT v 1.3 (06/10/23)
+CUFINUFFT v 1.3 (06/10/23) (Final legacy release outside FINUFFT repo)

 * Move second half of onedim_fseries_kernel() to GPU (with a simple heuristic
   based on nf1 to switch between the CPU and the GPU version).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 311d4feaa..9e3ee23a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.19)

-project(finufft VERSION 2.1.0 LANGUAGES C CXX)
+project(finufft VERSION 2.2.0 LANGUAGES C CXX)

 set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
 if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
@@ -11,7 +11,12 @@ endif()

 include(CTest)

-set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
+  # PowerPC arch does not have -march flag.
+  set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.")
+else()
+  set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
+endif()
 set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
 set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")
diff --git a/CMakePresets.json b/CMakePresets.json
index 8acb6dc1c..2363692b1 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -55,6 +55,20 @@
                 "FINUFFT_USE_OPENMP": "OFF"
             }
         },
+        {
+            "name": "icx",
+            "binaryDir": "build/icx",
+            "displayName": "Intel Compiler (llvm)",
+            "description": "Build with Intel Compiler",
+            "generator": "Ninja Multi-Config",
+            "cacheVariables": {
+                "CMAKE_C_COMPILER": "icx",
+                "CMAKE_CXX_COMPILER": "icpx",
+                "CMAKE_Fortran_COMPILER": "ifx",
+                "FINUFFT_ARCH_FLAGS": "-xHost",
+                "CMAKE_CXX_FLAGS": "-fp-model=strict"
+            }
+        },
         {
             "name": "icc",
             "binaryDir": "build/icc",
@@ -109,6 +123,11 @@
             "configurePreset": "icc",
             "configuration": "RelWithDebInfo"
         },
+        {
+            "name": "icx",
+            "configurePreset": "icx",
+            "configuration": "RelWithDebInfo"
+        },
         {
             "name": "matlab",
             "configurePreset": "matlab",
diff --git a/Jenkinsfile b/Jenkinsfile
index 89951e3ba..2dc8247c5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,30 +9,53 @@ pipeline {
     stage('main') {
       agent {
         dockerfile {
-          filename 'tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64'
+          filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64'
           args '--gpus 2'
+          label 'v100'
         }
       }
       environment {
-        HOME = "$WORKSPACE/build"
+        HOME = "$WORKSPACE"
         PYBIN = "/opt/python/cp38-cp38/bin"
+        LIBRARY_PATH = "$WORKSPACE/build"
+        LD_LIBRARY_PATH = "$WORKSPACE/build"
       }
       steps {
        sh '''#!/bin/bash -ex
          nvidia-smi
        '''
        sh '''#!/bin/bash -ex
-         cp -r /io/build/test/cuda cuda_tests
-         cd cuda_tests
+         echo $HOME
+       '''
+       sh '''#!/bin/bash -ex
+         # v100 cuda arch
+         cuda_arch="70"
+
+         cmake -B build . -DFINUFFT_USE_CUDA=ON \
+                          -DFINUFFT_USE_CPU=OFF \
+                          -DFINUFFT_BUILD_TESTS=ON \
+                          -DCMAKE_CUDA_ARCHITECTURES="$cuda_arch" \
+                          -DBUILD_TESTING=ON
+         cd build
+         make -j4
+       '''
+       sh '''#!/bin/bash -ex
+         cd build/test/cuda
          ctest --output-on-failure
        '''
        sh '${PYBIN}/python3 -m venv $HOME'
        sh '''#!/bin/bash -ex
          source $HOME/bin/activate
          python3 -m pip install --upgrade pip
-         LIBRARY_PATH=/io/build python3 -m pip install -e python/cufinufft
+         python3 -m pip install --upgrade pycuda cupy-cuda112 numba
+         python3 -m pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
+         python3 -m pip install -e python/cufinufft
          python3 -m pip install pytest
-         python3 -m pytest
+         python3 -c "from numba import cuda; cuda.cudadrv.libs.test()"
+         python3 -m pytest --framework=pycuda python/cufinufft
+         python3 -m pytest --framework=numba python/cufinufft
+         python3 -m pytest --framework=cupy python/cufinufft
+         python3 -m pytest --framework=torch python/cufinufft
        '''
      }
    }
diff --git a/README.md b/README.md
index 042fbea10..88f34d84f 100644
--- a/README.md
+++ b/README.md
@@ -3,20 +3,20 @@
 Principal author **Alex H. Barnett**,
 main co-developers Jeremy F. Magland,
-Ludvig af Klinteberg, Yu-hsuan "Melody" Shih, Andrea Malleo, Libin Lu,
-and Joakim Andén;
+Ludvig af Klinteberg, Yu-hsuan "Melody" Shih, Libin Lu,
+Joakim Andén, and Robert Blackwell;
 see `docs/ackn.rst` for full list of contributors.

-This is a lightweight CPU library to compute the three standard types of nonuniform FFT to a specified precision, in one, two, or three dimensions. It is written in C++ with interfaces to C, Fortran, MATLAB/octave, Python, and (in a separate [repository](https://github.com/ludvigak/FINUFFT.jl)) Julia. It now contains the GPU CUDA library cuFINUFFT.
+This is a lightweight CPU library to compute the three standard types of nonuniform FFT to a specified precision, in one, two, or three dimensions. It is written in C++ with interfaces to C, Fortran, MATLAB/octave, Python, and (in a separate [repository](https://github.com/ludvigak/FINUFFT.jl)) Julia. It now also integrates the GPU CUDA library cuFINUFFT (which currently does all but type 3).

 Please see the [online documentation](http://finufft.readthedocs.io/en/latest/index.html)
 which can also be downloaded as a [PDF manual](https://finufft.readthedocs.io/_/downloads/en/latest/pdf/).
-You will also want to see example codes in the directories
-`examples`, `test`, `fortran`, `matlab/test`, and `python/test`.
-If you cannot compile, or `pip install`, try our rather outdated [precompiled binaries](http://users.flatironinstitute.org/~ahb/codes/finufft-binaries).
+You will also want to see CPU example codes in the directories `examples`, `test`, `fortran`, `matlab/test`, `matlab/examples`, `python/finufft/test`, etc., and GPU examples in `examples/cuda`, `test/cuda`, etc.
+
+If you cannot build via CMake, try the old makefile. Python users can try `pip install finufft`. See the docs for details. See our GitHub Issues for tips.

 If you prefer to read text files, the source to generate the above documentation is in human-readable (mostly .rst) files as follows:
@@ -29,22 +29,22 @@
- `docs/c_gpu.rst` : documentation of C++/C function API for GPU library
- `docs/opts.rst` : optional parameters
- `docs/error.rst` : error codes
-- `docs/trouble.rst` : troubleshooting
+- `docs/trouble.rst` : troubleshooting advice
- `docs/tut.rst` and `docs/tutorial/*` : tutorial application examples
- `docs/fortran.rst` : usage examples from Fortran, documentation of interface
- `docs/matlab.rst` and `docs/matlabhelp.raw` : using the MATLAB/Octave interface
- `docs/python.rst` and `python/*/_interfaces.py` : using the Python interface
- `docs/python_gpu.rst` : Python interface to GPU library
-- `docs/julia.rst` : using the Julia interface
+- `docs/julia.rst` : options for Julia users
- `docs/devnotes.rst`: notes/guide for developers
- `docs/related.rst` : other recommended NUFFT packages
-- `docs/users.rst` : users of FINUFFT and dependent packages
+- `docs/users.rst` : some known users of FINUFFT, dependent packages
- `docs/ackn.rst` : authors and acknowledgments
-- `docs/refs.rst` : journal article references (ours and others)
+- `docs/refs.rst` : journal article references (both ours and others)

-If you find (cu)FINUFFT useful in your work, please cite this repository and
-the following. For FINUFFT (CPU library):
+If you find (cu)FINUFFT useful in your work, please star this repository and
+cite it and the following. For FINUFFT (CPU library):

 A parallel non-uniform fast Fourier transform library based on an ``exponential of semicircle'' kernel.
 A. H. Barnett, J. F. Magland, and L. af Klinteberg.
diff --git a/TODO b/devel/TODO
similarity index 100%
rename from TODO
rename to devel/TODO
diff --git a/devel/agenda_11-27-23.txt b/devel/agenda_11-27-23.txt
new file mode 100644
index 000000000..547e31406
--- /dev/null
+++ b/devel/agenda_11-27-23.txt
@@ -0,0 +1,138 @@
+Goals: make release with some ok and stable stuff in it
+
+
+Release (name = v2.3 or v2.2.1?) ------------------------------------------
+
+ - presumably a tag on GH, once done
+
+ - 2.2 = 2.2.0dev0 never tagged. Last release was 2.1 in June 2022.
+
+ - binaries available for users (requests eg #362 #285) -> place binaries for
+   3 OSes somewhere on GH, eg "Releases" -> Assets. [ Can automate?]
+   Kill off my old binary webpage.
+
+
+CI ...........
+
+CI/jenkins/ branch -> failing suddenly 11/22 - why?
+
+CI status runs only CPU tests:
+  spreadtestall.sh (has no math checking, a small perf test)
+  basicpassfail{f} (exit code checks math)
+  check_finufft.sh (both prec; has math chk + dumbinputs chk)
+  CI doesn't run GPU!
+
+CI improvements:
+  run examples?
+  fortran / other langs ? (complicated in CI...)
+
+PR 254 - CI using cmake(ctest), builds on.
+  - can OMP_NUM_THREADS be set in platform-indep way in ctest add_test() ? no
+    DD says do in C++: https://github.com/flatironinstitute/finufft/pull/254
+    Much easier, but needs a new cmd arg in the test/*.cpp executables
+
+CI py had zero CPU time on windows -> div-by-0 error! (I fixed). why?
+
+
+library.....
+
+[ahb brought in PR 354 thread safety. Simplifies docs too.
+  - jax waiting on this
+  - ask Robert: unclear why some fftw_free or alloc are mutexed and not others?
+  - ahb tested w/ all make tasks.]
+
+GPU: PR 330: deal w/ cuda streams - difficult for ahb to test
+  - jax waiting on this
+  - multicoil MRI users have opinions
+
+remove obsolete bkw-compat (nufft_opts) ?
+
+
+wrappers....
+
+Libin to fix (1,M) vs (M,) Py issue #367
+
+[Libin did PR for #359, but not closed? I tested py and closed]
+
+pr 308 - no
+
+
+docs....
+
+RTD missing a Secret for webhook to tell GH; RTD won't give it :)
+I changed repo to FI/finufft from ahbarnett/finufft ... RTD webhook fails
+(code 400). Resync fails. Checked build (py 3.11) by hand is ok (1 min).
+ask Dylan.
+
+
+Notes for team......
+
+  We need to keep docs/* and CHANGELOG up to date when we make changes!
+
+  who tests GPU code and adds more tests, CI?
+
+  Please close an issue after you fix it
+
+
+
+========= FUTURE DECISIONS =============
+
+* Build:
+  - keep makefile, yes, but slowly deprecate (eg Leslie/Alex rely on it) ?
+
+* CI:
+  complete rest of PR 254 for CI via cmake/ctest.
+
+* CPU:
+  * allow FFT switching (plus in cmake but not old makefile?)
+    - see Reinecke pr 287. Benchmarking needed for us, including batched FFTW.
+    - good: no fft plan needed.
+    - bad: installation complexity :( (makefile and CMake)
+
+* GPU:
+  * Type 3 in GPU.
+  * Paquiteau request for multi-coil MRI:
+    having multiple plans with shared memory (e.g. for the NU pts locations and other parameters) could be an alternative?
+    https://github.com/flatironinstitute/finufft/pull/330
+  * GPU interfaces to py
+  * matlab par-toolbox gpuarray interfaces
+
+* explicit 1D slices for fftw, cuts 2d and 3d fft cost down (Reinecke idea).
+  - #304 has Robert's start in 2D. Also Reinecke has in #287.
+  - but: must work with batching too (batched 1d's with stride, ugh?).
+  - nontrivial, benchmark -> a couple of days solid work.
+
+* improved window shapes, more upsampfacs, like ducc0
+  - several days work
+
+* SIMD spread/interp/sort
+  - significant work, but big potential gains
+
+* Overall, consider joining XSDK via conforming to their good policies:
+https://github.com/xsdk-project/xsdk-community-policies
+Mandatory policies: we already conform to
+M2, M4 (need NERSC etc CMake presets...), M7, M9, M10, M11, M14, M17 (I think).
+I don't understand: M12.
+Not relevant: M3.
+Leaves to do:
+M1 - spack installation route
+M5 - reliable contact for dev team - easy
+M8 - runtime API to get version # - easy
+M13 - we don't really have a make install.
But local locations correct.
+M16 - our DEBUG is runtime, not a build option. Need to add -g in a cmake task.
+CI is not mentioned.
+
+* source code formatting?
+
+* see devel/TODO - and make this file current by thinking/killing issues there
+
+* Marco plan April through Aug 2024
+
+
+
+========= Random Questions ==========
+
+#569 ? onedim_fseries_kernel ... was not brought in
+
+Can I kill .travis.yml ? (3 yrs old - what's it for?)
+
diff --git a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
index 091b2a9d7..c80a53f08 100644
--- a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
+++ b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
@@ -52,9 +52,11 @@ SMALL TASKS IMMEDIATE:

 * DOCS: (Alex will update RST docs for cuda, incl its PNG). DONE.
 -INSTALL: add how to find SM_?? cuda arch [DONE]

-* test issue #276 re meth=1,2 - Alex will update
+* test issue #276 re meth=1,2 - Alex will update. DONE
+
+* cufinufft has C and C++ - J make issue [<- Alex forgot what this meant??]
+[was merely problem with headers in using gcc] - DONE.

-* cufinufft has C and C++ - J make issue <- Alex forgot what this meant??
 * see if gpu error codes can be merged into cpu - the only new one could be
 cudaMalloc failed.
diff --git a/devel/finufft_meeting-7-5-23.txt b/devel/finufft_meeting-7-5-23.txt
new file mode 100644
index 000000000..3e843d84c
--- /dev/null
+++ b/devel/finufft_meeting-7-5-23.txt
@@ -0,0 +1,73 @@
+
+Aiming for v2.2 release late-July 2023.
+(Save cufinufft t3 and FFT-sparsity exploits for v2.3, in Fall '23).
+
+
+TO DO:
+
+* cufinufft_default_opts needs to match finufft - remove dims and type
+Sets defaults: gpu_sort =1, gpu_method=0 (means auto-choose)
+
+gpu_binsize{x,y,z} = 0 and change later default-setting to
+
+cufinufft_makeplan: don't change user opts struct,
+just change the deep copy sitting in the plan object.
+if gpu_meth = 0, auto-choose it (1,2,4).
+
+Rationalize naming of opts: (enums?)
+
+gpu_meth = 1 is GM (for type 1 or 2, and best for type 2 only)
+gpu_meth = 2 is SM for spread (best for type-1); gpu_sort then ignored (check?)
+gpu_meth = 4 is block-gather, for spread, only in 3d - experimental
+
+gpu_sort =0 : GM
+gpu_sort =1 : GM-sort (default)
+
+(Py internal _default_opts needs to be updated to match)
+
+
+* Joakim dtype Py deprecate with attribute error dtype=Float64
+  cufinufft.nufft simple interfaces (only for Py), add to docs, Joakim.
+
+
+* cufinufft tests:
+PR #303, merge.
+Maybe Alex test this.
+
+* gpu error codes: merge them into cpu, add GPU ALLOC ERROR.
+Ie add GPU-specific err codes to l.42 include/finufft/defs.h
+Not have cufinufft give ier=1 randomly. Issue #284.
+Test for cuda kernel launch failures? we don't currently. Issue #273.
+
+* bring in as many small PRs as possible and close issues.
+
+
+
+LESS URGENT TO DO:
+
+* alex "answer" discussions.
+* Julia wrapper for gpu code - Ludvig?
+* matlab wrapper to gpu code - gpuArray.
+* pyproject.toml: 3.12 obsoleting distutils. But can still use setup.py
+  [for 2024]
+* Exploit sparsity of 2d/3d FFT a la Reinecke suggestion, and Robert demo.
+  Esp. important for cuFFT GPU version (upsampfac=2, so nf>=2N in each dim)
+  For CPU code a little tricky to decide ntrans>1, multithreading.
+
+
+
+TO NOT DO:
+
+* PR 306 py expose spread/interp, leave for now
+
+
+DONE:
+
+folder struct reorg
+
+Reinecke perf improvements CPU: binsorting; interp in 2d/3d.
+
+1.3 legacy cuf repo.
+
+docs
+
diff --git a/devel/plans_fall23.txt b/devel/plans_fall23.txt
new file mode 100644
index 000000000..359d8cc8b
--- /dev/null
+++ b/devel/plans_fall23.txt
@@ -0,0 +1,27 @@
+FINUFFT (CPU+GPU) plans, Fall 2023:
+
+pick meeting date
+
+* CPU spreader/interp, bring in Wenda's stuff -> Marco?
+
+* GPU type 3. Marco?
+
+* implement Reinecke's sparse sliced FFT, w/ many-vectors too. CPU
+Libin help?
+
+* binaries release with GH Releases feature (Assets?). Not crucial.
+
+* CPU perf tests, standardized way to benchmark spread/interp, FFT, H<>D, etc.
+Add to docs/devnotes.rst how to run benchmarks.
+Robert + Joakim
+
+* tutorial in docs: on eg inverse FFT by CG iteration.
+
+* doc for py local install #340
+
+* #340 docs for GPU simple interface
+
+Go through Issues & prioritize.
+
+PRs.
+
diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index 4c6dc7175..21a4e3170 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -295,10 +295,8 @@ To create such a structure, use:

 .. code-block:: c

   cufinufft_opts opts;
-  ier = cufinufft_default_opts(type, dim, &opts);
+  cufinufft_default_opts(&opts);

-where ``type`` and ``dim`` are as above in ``cufinufft_makeplan``.
-As before, the returned value is zero if success, otherwise an error occurred.
 Then you may change fields of ``opts`` by hand, and finally pass ``&opts`` in as
 the last argument to ``cufinufft_makeplan`` or ``cufinufftf_makeplan``.
 The options fields are currently only documented in ``include/cufinufft_opts.h``.
diff --git a/docs/cex.rst b/docs/cex.rst
index 1f824bc8b..b1b9b2a1f 100644
--- a/docs/cex.rst
+++ b/docs/cex.rst
@@ -217,8 +217,8 @@
 See ``examples/many1d1.cpp`` and ``test/finufft?dmany_test.cpp`` for more
 examples.

-Guru interface example
-----------------------
+Guru interface examples
+-----------------------

 If you want more flexibility than the above, use the "guru" interface:
 this is similar to that of FFTW3, and to the main interface of
@@ -268,42 +268,40 @@
 is needed since the Fourier coefficient dimensions are passed as an array.
 The complete code with a math test is in ``examples/guru2d1.cpp``, and for
 more examples see ``examples/guru1d1*.c*``

+Using the guru interface to perform a vectorized transform (multiple 1D type 1
+transforms each with the same nonuniform points) is demonstrated in
+``examples/gurumany1d1.cpp``. This is similar to the single-command vectorized
+interface, but allows more control (changing the nonuniform points without
+re-planning the FFT, etc).

-Thread safety and global state
-------------------------------
+
+Thread safety for single-threaded transforms, and global state
+--------------------------------------------------------------

 It is possible to call FINUFFT from within multithreaded code, e.g. in an
-OpenMP parallel block. In this case ``opts.nthreads=1`` should be set,
-and FINUFFT must have been compiled with the ``-DFFTW_PLAN_SAFE`` flag,
-making it thread-safe.
-For demos of this, see
+OpenMP parallel block. In this case ``opts.nthreads=1`` should be set, otherwise
+a segfault will occur. This is useful if you don't want to synchronize
+independent transforms.
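+A minimal sketch of this pattern (assuming ``M``, ``N``, ``tol`` and the usual
+headers are in scope; see ``examples/threadsafe1d1.cpp`` below for the full,
+math-checked demo):
+
+.. code-block:: C++
+
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.nthreads = 1;            // crucial: keep each transform single-threaded
+  #pragma omp parallel
+  {
+    std::vector<double> x(M);                        // thread-local NU points
+    std::vector<std::complex<double>> c(M), F(N);
+    // ... fill x and c for this thread's own transform ...
+    int ier = finufft1d1(M, x.data(), c.data(), +1, tol, N, F.data(), &opts);
+  }
+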
+For demos of this "parallelize over single-threaded transforms" use case, see +the following, which are built as part of the ``make examples`` task: * ``examples/threadsafe1d1`` which runs a 1D type-1 separately on each thread, checking the math, and -* ``examples/threadsafe2d2f`` which runs a 2D type-2 on each slice, which are parallelized over via an OpenMP parallel for loop (without any math check, just dummy inputs) - -which are both built by ``make examples`` if the above flag has been set. - -Note: A design decision of FFTW is to have a global state which stores -wisdom and settings. Such global state can cause unforeseen effects on other routines that also use FFTW. In contrast, FINUFFT uses pointers to plans to store -its state, and does not have a global state (other than one ``static`` -flag used as a lock on FFTW initialization in the FINUFFT plan -stage). This means different FINUFFT calls should not affect each other, -although they may affect other codes that use FFTW via FFTW's global state. - -Alternatively it is possible to achieve thread safety without compiling with ``-DFFTW_PLAN_SAFE``. This can lead to higher performance. -This way to achieve thread safety is to use the guru interface and split initialization and transformation into two steps. The initialization step is not thread safe, but the transformation step is. The initialization step can be parallelized over threads, but the transformation step cannot. See ``examples/threadsafe1d1`` for an example of this. -In particular: - -.. code-block:: C++ +* ``examples/threadsafe2d2f`` which runs a 2D type-2 on each "slice" (in the MRI + language), parallelized over slices with an OpenMP parallel for loop. + (In this code there is no math check, just status check.) - finufft_makeplan(...args...) // not thread safe +However, if you have multiple transforms with the *same* nonuniform points for +each transform, it is probably much faster to use the vectorized interface, +and do all these transforms with a single such multithreaded FINUFFT call +(see ``examples/many1d1.cpp`` and ``examples/gurumany1d1.cpp``). +This may be less convenient if you want to leave your slices unsynchronized. -All the other functions are thread safe. With OpenMP is possible to use: - -.. code-block:: C++ - - #pragma omp critical - finufft_makeplan(...args...) // not thread safe - -to achieve thread safety. +.. note:: + A design decision of FFTW is to have a global state which stores + wisdom and settings. Such global state can cause unforeseen effects on other + routines that also use FFTW. In contrast, FINUFFT uses pointers to plans to store + its state, and does not have a global state (other than one ``static`` + flag used as a lock on FFTW initialization in the FINUFFT plan + stage). This means different FINUFFT calls should not affect each other, + although they may affect other codes that use FFTW via FFTW's global state. diff --git a/docs/conf.py b/docs/conf.py index 2148db4ff..f67d4b6a0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -21,10 +21,11 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, os.path.abspath('../python')) +sys.path.insert(0, os.path.abspath('../python/finufft')) +sys.path.insert(0, os.path.abspath('../python/cufinufft')) # quash sphinx's inevitable failure of C++ lib extension to import... 
-autodoc_mock_imports = ['finufft._finufft', 'numpy']
+autodoc_mock_imports = ['finufft._finufft', 'numpy', 'pycuda', 'cufinufft._cufinufft']
 # The above is not enough for nested import -- forcibly mock them out ahead of time:
 #for name in autodoc_mock_imports:
 #    sys.modules[name] = sphinx.ext.autodoc._MockModule(name, None)
@@ -73,9 +74,9 @@
 # built documents.
 #
 # The short X.Y version.
-version = u'2.1'
+version = u'2.2'
 # The full version, including alpha/beta/rc tags.
-release = u'2.1.0'
+release = u'2.2.0.dev0'

 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
diff --git a/docs/devnotes.rst b/docs/devnotes.rst
index a8229e549..96fb181a8 100644
--- a/docs/devnotes.rst
+++ b/docs/devnotes.rst
@@ -9,8 +9,8 @@ Developer notes

   - ``CMakeLists.txt`` for cmake
   - ``docs/conf.py`` for sphinx
-  - ``python/setup.py`` for the python pkg version
-  - ``python/finufft/__init__.py``
+  - ``python/finufft/setup.py`` for the python pkg version
+  - ``python/cufinufft/setup.py`` for the GPU python pkg version
   - ``include/finufft/defs.h`` for the debug>0 output
   - ``matlab/Contents.m`` for the MATLAB/Octave help
   - ``CHANGELOG``: don't forget to describe the new features and changes, folding lines at 80 chars.
@@ -21,14 +21,14 @@

* For testing and performance measuring routines see ``test/README`` and ``perftest/README``. We need more of the latter, eg, something making performance graphs that enable rapid eyeball comparison of various settings/machines.

-* Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test build variants OMP=OFF nor FFTW_PLAN_SAFE options. The dev should test these locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``JenkinsFile`` for the GPU CI via python wrappers.
+* Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test the build variant OMP=OFF; the dev should test this locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``Jenkinsfile`` for the GPU CI via python wrappers.

* **Installing MWrap**. This is needed only for experts to rebuild the matlab/octave interfaces. `MWrap <https://github.com/zgimbutas/mwrap>`_ is a very useful MEX interface generator by Dave Bindel, now maintained and expanded by Zydrunas Gimbutas. Make sure you have ``flex`` and ``bison`` installed to build it.
-  As of FINUFFT v.2.0 you will need a recent (>=0.33.10) version of MWrap.
+  As of FINUFFT v2.0 you will need a recent (>=0.33.10) version of MWrap.
  Make sure to override the location of MWrap by adding a line such as::

    MWRAP = your-path-to-mwrap-executable
diff --git a/docs/error.rst b/docs/error.rst
index 68df90c62..e40d8e256 100644
--- a/docs/error.rst
+++ b/docs/error.rst
@@ -5,7 +5,7 @@ Error (status) codes

 In all FINUFFT interfaces, the returned value ``ier`` is a status indicator.
 It is ``0`` if successful, otherwise the error code
-has the following meanings (see ``include/defs.h``):
+has the following meanings (see ``include/finufft_errors.h``):

 ::

  1  requested tolerance epsilon too small to achieve (warning only)
  2  stopped due to needing internal array size >MAX_NF (defined in defs.h)
  3  spreader: fine grid too small compared to spread (kernel) width
  4  spreader: if chkbnds=1, nonuniform pt out of range [-3pi,3pi]^d
  5  spreader: array allocation error
  6  spreader: illegal direction (should be 1 or 2)
  7  upsampfac too small (should be >1.0)
  8  upsampfac not a value with known Horner poly eval rule (currently 2.0 or 1.25 only)
  9  ntrans not valid in "many" (vectorized) or guru interface (should be >= 1)
  10 transform type invalid
  11 general allocation failure
  12 dimension invalid
  13 spread_thread option invalid
-
+ 14 invalid mode array (more than ~2^31 modes, dimension with 0 modes, etc)
+ 15 cuda failure (failure to call any cuda function/kernel)
+ 16 attempt to destroy an uninitialized plan
+ 17 invalid spread/interp method for dim (attempt to blockgather in 1D, e.g.)
+ 18 size of bins for subprob/blockgather invalid
+ 19 GPU shmem too small for subprob/blockgather parameters
+
 When ``ier=1`` (warning only) the transform(s) is/are still completed, at the
 smallest epsilon achievable, so, with that caveat, the answer should still be
 usable. For any other nonzero values of ``ier`` the transform may not have been
 performed and the output should not be trusted. However, we hope that the value
 of ``ier`` will help to narrow down the problem.
diff --git a/docs/install.rst b/docs/install.rst
index 6d379580e..3cefefbfa 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -26,7 +26,7 @@ Then add the following to your ``CMakeLists.txt``:

  CPMAddPackage(
    NAME Finufft
    GIT_REPOSITORY https://github.com/flatironinstitute/finufft.git
-   GIT_TAG v2.1.0
+   GIT_TAG master
    GIT_SHALLOW Yes
    GIT_PROGRESS Yes
    EXCLUDE_FROM_ALL Yes
  )
@@ -81,6 +81,10 @@ For example, to configure, build and test the development preset (which builds t

 From other CMake projects, to use ``finufft`` as a library, simply add this repository as a subdirectory using ``add_subdirectory``, and use ``target_link_libraries(your_executable finufft)``.

+.. note::
+
+   When compiling with CMake on Linux at the Flatiron Institute (Rusty cluster): we have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3``, otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
+

 Old GNU make based route
 ------------------------
@@ -260,8 +264,6 @@
 This is for experts. Here are all the flags that the FINUFFT source responds to.
 Activate them by adding a line of the form ``CXXFLAGS+=-DMYFLAG`` in your ``make.inc``:

-* ``-DFFTW_PLAN_SAFE``: This makes FINUFFT call ``fftw_make_planner_thread_safe()`` as part of its FFTW3 planner stage; see http://www.fftw.org/fftw3_doc/Thread-safety.html. This makes FINUFFT thread-safe. See ``examples/threadsafe1d1.cpp`` and ``examples/threadsafe2d2f.cpp``. This is only available in FFTW version >=3.3.6; for this reason it is not yet the default. If you get segfault on these examples, try ``FFTWOMPSUFFIX = threads`` as explained below.
-
* ``-DSINGLE``: This is internally used by our build process to switch
  (via preprocessor macros) the source from double to single precision.
  You should not need to use this flag yourself.
@@ -356,14 +358,20 @@
 The GCC route
 ~~~~~~~~~~~~~~

 This is less recommended, unless you need to link from ``gfortran``, when it
-appears to be essential. We have tested on Movaje::
+appears to be essential. The basic idea is::

-  cp make.inc.macosx_gcc-8 make.inc
+  cp make.inc.macosx_gcc-12 make.inc
   make test -j
   make fortran

 which also compiles and tests the fortran interfaces.
-In Catalina you'll probably need to edit to ``g++-10`` in your ``make.inc``.
+You may need to edit this to ``g++-11``, or whatever your GCC version is,
+in your ``make.inc``.
+
+.. note::
+
+   A problem between GCC and the new XCode 15 requires a workaround to add ``LDFLAGS+=-ld64`` to force the old linker to be used. See the above file ``make.inc.macosx_gcc-12``.
+
 We find python may be built as :ref:`below`. We found that octave
 interfaces do not work with GCC; please help. For MATLAB, the MEX settings may need to be
diff --git a/docs/python_gpu.rst b/docs/python_gpu.rst
index 6ebbcfc59..140c8b557 100644
--- a/docs/python_gpu.rst
+++ b/docs/python_gpu.rst
@@ -66,3 +66,4 @@ Full documentation
 .. automodule:: cufinufft
    :members:
    :member-order: bysource
+   :undoc-members:
diff --git a/docs/trouble.rst b/docs/trouble.rst
index 4d9711600..2b7b8114a 100644
--- a/docs/trouble.rst
+++ b/docs/trouble.rst
@@ -70,7 +70,7 @@ Crash (segfault) issues and advice

- Maybe you have switched off nonuniform point bounds checking (``opts.chkbnds=0``) for a little extra speed? Try switching it on again to catch illegal coordinates.

-- Thread-safety: are you calling FINUFFT from inside a multithreaded block of code, without compiling with ``-DFFTW_PLAN_SAFE`` or setting ``opts.nthreads=1``? If ``gdb`` indicates crashes during FFTW calls, this is another sign.
+- Thread-safety: are you calling FINUFFT from inside a multithreaded block of code without setting ``opts.nthreads=1``? If ``gdb`` indicates crashes during FFTW calls, this is another sign.

- To isolate where a crash is occurring, set ``opts.debug`` to 1 or 2, and check the text output of the various stages. With a debug setting of 2 or above, when ``ntrans>1`` a large amount of text can be generated.
diff --git a/docs/tut.rst b/docs/tut.rst
index 19476052c..7dcc07b87 100644
--- a/docs/tut.rst
+++ b/docs/tut.rst
@@ -27,5 +27,5 @@
 For further applications, see :ref:`references <refs>`, and:

 * The numerical sampling of `random plane waves `_.

-* These seminar `PDF slides `_.
+* These seminar `PDF slides `_.
diff --git a/docs/users.rst b/docs/users.rst
index bbcd80ee7..e689a9e94 100644
--- a/docs/users.rst
+++ b/docs/users.rst
@@ -76,7 +76,10 @@
 For the latest see: Google Scholar `FINUFFT citations `_
diff --git a/examples/guru1d1.cpp b/examples/guru1d1.cpp
--- a/examples/guru1d1.cpp
+++ b/examples/guru1d1.cpp
@@ -1,6 +1,7 @@
 #include <finufft.h>
 #include <complex>
 #include <stdio.h>
+#include <cassert>
 // only good for small projects...
 using namespace std;
@@ -18,10 +19,10 @@ int main(int argc, char* argv[])
    pointers to STL vectors of C++ double complex numbers, with a math check.
    Barnett 2/27/20

-   Compile on linux with:
-   g++-7 -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm
+   Compile on linux with (or see ../makefile):
+   g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm

-   Or if you have built a single-core library, remove -fopenmp and -lfftw3_omp
+   Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp

    Usage: ./guru1d1
*/
@@ -49,7 +50,7 @@ int main(int argc, char* argv[])
   for (int j=0; j<M; ++j)
     x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);   // uniform random in [-pi,pi)
-  finufft_setpts(plan, M, &x[0], NULL, NULL, 0, NULL, NULL, NULL);
+  finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);

   // generate some complex strengths...
   vector<complex<double>> c(M);
@@ -58,17 +59,18 @@ int main(int argc, char* argv[])

   // alloc output array for the Fourier modes, then do the transform
   vector<complex<double>> F(N);
-  int ier = finufft_execute(plan, &c[0], &F[0]);
+  int ier = finufft_execute(plan, c.data(), F.data());

   // for fun, do another with same NU pts (no re-sorting), but new strengths...
   for (int j=0; j<M; ++j)
     c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
-  ier = finufft_execute(plan, &c[0], &F[0]);
+  ier = finufft_execute(plan, c.data(), F.data());
   finufft_destroy(plan);    // done with transforms of this size

   // rest is math checking and reporting...
   int n = 142519;   // check the answer just for this mode
+  assert(n>=-(double)N/2 && n<(double)N/2);   // ensure meaningful test
   complex<double> Ftest = complex<double>(0,0);
   for (int j=0; j<M; ++j)
diff --git a/examples/gurumany1d1.cpp b/examples/gurumany1d1.cpp
new file mode 100644
index 000000000..64a2a621e
--- /dev/null
+++ b/examples/gurumany1d1.cpp
@@ -0,0 +1,91 @@
+/* Demo of guru interface from C++, performing a stack of identical 1D type 1
+   transforms in a single execute call. Based on guru1d1. Barnett 11/22/23
+
+   Compile multithreaded with
+   g++ -fopenmp gurumany1d1.cpp -I../include ../lib/libfinufft.so -o gurumany1d1
+   single core:
+   g++ gurumany1d1.cpp -I../include ../lib/libfinufft.so -o gurumany1d1
+
+   Usage: ./gurumany1d1
+*/
+
+// this is all you must include for the finufft lib...
+#include <finufft.h>
+#include <complex>
+
+// specific to this demo...
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <vector>
+
+// only good for small projects...
+using namespace std;
+// allows 1i to be the imaginary unit... (C++14 onwards)
+using namespace std::complex_literals;
+
+int main(int argc, char* argv[])
+{
+  int M = 2e5;            // number of nonuniform points
+  int N = 1e5;            // number of modes
+  double tol = 1e-9;      // desired accuracy
+  int ntrans = 100;       // request a bunch of transforms in the execute
+  int isign = +1;         // sign of i in the transform math definition
+
+  int type = 1, dim = 1;                 // 1d1
+  int64_t Ns[3] = {N,0,0};   // guru describes mode array by vector [N1,N2..]
+  finufft_plan plan;         // creates a plan struct (NULL below: default opts)
+  finufft_makeplan(type, dim, Ns, isign, ntrans, tol, &plan, NULL);
+
+  // generate random nonuniform points and pass to FINUFFT
+  vector<double> x(M);
+  for (int j=0; j<M; ++j)
+    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);   // uniform random in [-pi,pi)
+  finufft_setpts(plan, M, x.data(), NULL, NULL, 0, NULL, NULL, NULL);
+
+  // generate ntrans stacks of complex strengths
+  vector<complex<double>> c(M*ntrans);   // plain contiguous storage
+  for (int j=0; j<M*ntrans; ++j)
+    c[j] = 2*((double)rand()/RAND_MAX)-1 + 1i*(2*((double)rand()/RAND_MAX)-1);
+
+  // alloc output array for the Fourier modes, then do the transform
+  vector<complex<double>> F(N*ntrans);
+  printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms (vectorized), each size %d NU pts to %d modes...\n",tol,ntrans,M,N);
+  int ier = finufft_execute(plan, c.data(), F.data());
+
+  // could now change c, do another execute, do another setpts, execute, etc...
+
+  finufft_destroy(plan);    // don't forget! we're done with transforms of this size
+
+  // rest is math checking and reporting...
+  int k = 42519;     // check the answer just for this mode
+  int trans = 71;    // ...testing in just this transform
+  assert(k>=-(double)N/2 && k<(double)N/2);   // ensure meaningful test
+  assert(trans>=0 && trans<ntrans);
+  complex<double> Ftest = complex<double>(0,0);
+  for (int j=0; j<M; ++j)
+    Ftest += c[j+M*trans] * exp(1i*((double)k)*x[j]);
+  // compute inf norm of F for this transform, for relative error...
+  double Fmax = 0.0;
+  for (int m=0; m<N; ++m) {
+    double aF = abs(F[m+N*trans]);
+    if (aF>Fmax) Fmax=aF;
+  }
+  int nout = k+N/2 + N*trans;   // output index for freq mode k in the trans
+  double err = abs(F[nout] - Ftest)/Fmax;
+  printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n",ier,trans,k,err);
+
+  return ier;
+}
diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp
index 5656df7d5..f25f25b8b 100644
--- a/examples/threadsafe1d1.cpp
+++ b/examples/threadsafe1d1.cpp
@@ -13,13 +13,12 @@ using namespace std;
 int main(int argc, char* argv[])
 /* Demo single-threaded FINUFFT calls from inside a OMP parallel block.
    Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test.
-   Barnett 4/19/21, eg for Goran Zauhar, issue #183.
+   Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp.

-   Notes: libfinufft *must* have been built with -DFFTW_PLAN_SAFE, which needs
-   FFTW >= 3.3.6. You also may not have libfftw3_omp, so I have switched to
-   libfftw3_threads in this suggested compile command:
+   Notes: FFTW linking is now handled by the FINUFFT library itself, so no
+   FFTW flags are needed in this suggested compile command:

-   g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1 -lfftw3 -lfftw3_threads -lm
+   g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1

    Usage: ./threadsafe1d1

@@ -34,21 +33,22 @@
   finufft_default_opts(opts);
   complex<double> I = complex<double>(0.0,1.0);   // the imaginary unit

-  // generate some random nonuniform points (x) and complex strengths (c)...
-  vector<double> x(M);
-  for (int j=0; j<M; ++j)
-    x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);   // uniform random in [-pi,pi)
-
-  opts->nthreads=1;  // this is *crucial* so that each call single-thread
+  opts->nthreads=1;  // *crucial* so that each call single-thread (otherwise segfaults)

   // Now have each thread do independent 1D type 1 on their own data:
#pragma omp parallel
   {
-    // generate some complex strengths (c)... local to the thread
+    // generate some random nonuniform points (x) and complex strengths (c)...
+    // Note that these are local to the thread (if you have the *same* sets of
+    // NU pts x for each thread, consider instead using one vectorized multithreaded
+    // transform, which would be faster).
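+    // (Sketch of that vectorized alternative, using the guru interface as in
+    //  gurumany1d1.cpp above: one finufft_makeplan with ntrans set to the
+    //  number of strength vectors, one setpts, then a single multithreaded
+    //  execute.)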
+    vector<double> x(M);
     vector<complex<double>> c(M);
-    for (int j=0; j<M; ++j)
-      c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
+    for (int j=0; j<M; ++j) {
+      x[j] = M_PI*(2*((double)rand()/RAND_MAX)-1);   // uniform random in [-pi,pi)
+      c[j] = 2*((double)rand()/RAND_MAX)-1 + I*(2*((double)rand()/RAND_MAX)-1);
+    }
     // allocate output array for the Fourier modes, then do transform...
     vector<complex<double>> F(N);
@@ -70,5 +70,6 @@
     printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",omp_get_thread_num(),ier,k,err);
   }

+  return 0;
 }
diff --git a/examples/threadsafe2d2f.cpp b/examples/threadsafe2d2f.cpp
index c7ccfde67..9844af54a 100644
--- a/examples/threadsafe2d2f.cpp
+++ b/examples/threadsafe2d2f.cpp
@@ -1,15 +1,22 @@
 /* This is a 2D type-2 demo calling single-threaded FINUFFT inside an OpenMP
-   loop, to show thread-safety. It is closely based on a test code of Penfe,
+   loop, to show thread-safety with independent transforms, one per thread.
+   It is based on a test code of Penfe,
    submitted github Issue #72. Unlike threadsafe1d1, it does not test the math;
-   it is the shell of an application from MRI reconstruction.
+   it is the shell of an application from multi-coil/slice MRI reconstruction.
+   Note that since the NU pts are the same in each slice, in fact a vectorized
+   multithreaded transform could do all these slices together, and faster.

-   FINUFFT lib must be built with (non-default) flag -DFFTW_PLAN_SAFE, and
-   FFTW version >= 3.3.6. See http://www.fftw.org/fftw3_doc/Thread-safety.html
+   To compile (no FFTW flags are needed; the library handles FFTW linking):

-   Then to compile (note uses threads rather than omp version of FFTW3):
+   g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g -Wall

-   g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -lfftw3 -lfftw3_threads -lm -g -Wall
+   ./threadsafe2d2f                     <-- use all threads
+   OMP_NUM_THREADS=1 ./threadsafe2d2f   <-- sequential, 1 thread
+
+   Expected output is 50 lines, each showing exit code 0. It's ok if they're
+   mangled due to threads writing to stdout simultaneously.
+
+   Barnett, tidied 11/22/23
 */

// this is all you must include for the finufft lib...
@@ -22,34 +29,37 @@
 #include <omp.h>
 using namespace std;

-void test_finufft(finufft_opts* opts)
-// self-contained small test that single-prec FINUFFT2D2 no error nor crash
+int test_finufft(finufft_opts* opts)
+// self-contained small test that one single-prec FINUFFT2D2 has no error/crash
 {
-    size_t n_rows = 8, n_cols = 8;
-    size_t n_read = 16, n_spokes = 4;
-    std::vector<float> x(n_read * n_spokes);        // bunch of zero data
-    std::vector<float> y(n_read * n_spokes);
-    std::vector<std::complex<float>> img(n_rows * n_cols);
-    std::vector<std::complex<float>> ksp(n_read * n_spokes);
+    size_t n_rows = 256, n_cols = 256;      // 2d image size
+    size_t n_read = 512, n_spokes = 128;    // some k-space point params
+    size_t M = n_read*n_spokes;             // how many k-space pts; MRI-specific
+    std::vector<float> x(M);                // bunch of zero input data
+    std::vector<float> y(M);
+    std::vector<std::complex<float>> img(n_rows * n_cols);   // coeffs
+    std::vector<std::complex<float>> ksp(M);   // output array (vals @ k-space pts)

-    int ier = finufftf2d2(n_read * n_spokes, x.data(), y.data(), ksp.data(),
-                          -1, 1e-3, n_rows, n_cols, img.data(), opts);
+    int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(),
+                          -1, 1e-3, n_rows, n_cols, img.data(), opts);

-    std::cout << "\ttest_finufft: " << ier << ", thread " << omp_get_thread_num() << std::endl;
+    std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() << std::endl;
+    return ier;
 }

 int main(int argc, char* argv[])
 {
     finufft_opts opts;
     finufftf_default_opts(&opts);
-    opts.nthreads = 1;   // this is *crucial* so that each call single-thread
-
-    int n_slices = 50;   // parallelize over slices
-#pragma omp parallel for num_threads(8)
-    for (int i = 0; i < n_slices; i++)
-    {
-        test_finufft(&opts);
-    }
+    opts.nthreads = 1;   // *crucial* so each call single-thread; else segfaults
+
+    int n_slices = 50;   // number of transforms; parallelize over slices
+    int overallstatus = 0;
+#pragma omp parallel for
+    for (int i = 0; i < n_slices; i++) {
+        int ier = test_finufft(&opts);
+        if (ier != 0) overallstatus = 1;
+    }

-    return 0;
+    return overallstatus;
 }
diff --git a/include/cufinufft.h b/include/cufinufft.h
index 106ea0d40..17ed6202b 100644
--- a/include/cufinufft.h
+++ b/include/cufinufft.h
@@ -1,8 +1,8 @@
 // Defines the C++/C user interface to CUFINUFFT library.
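// (A hedged usage sketch of the updated opts workflow declared below, cf.
//  docs/c_gpu.rst; type, dim, n_modes, iflag, ntr, eps assumed set by caller:
//    cufinufft_opts opts;
//    cufinufft_default_opts(&opts);    // note: no type/dim arguments anymore
//    opts.gpu_method = 1;              // optionally tweak fields by hand
//    cufinufft_plan plan;
//    cufinufft_makeplan(type, dim, n_modes, iflag, ntr, eps, &plan, &opts);
//  then setpts, execute, destroy as before.)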
 #include <stdint.h>
-#include <cufft.h>
 #include <cufinufft_opts.h>
+#include <finufft_errors.h>

 typedef struct cufinufft_plan_s *cufinufft_plan;
 typedef struct cufinufft_fplan_s *cufinufftf_plan;
@@ -10,12 +10,12 @@ typedef struct cufinufft_fplan_s *cufinufftf_plan;
 #ifdef __cplusplus
 extern "C" {
 #endif
-int cufinufft_default_opts(int type, int dim, cufinufft_opts *opts);
+void cufinufft_default_opts(cufinufft_opts *opts);

-int cufinufft_makeplan(int type, int dim, int64_t *n_modes, int iflag, int ntr, double eps, cufinufft_plan *d_plan_ptr,
-                       cufinufft_opts *opts);
-int cufinufftf_makeplan(int type, int dim, int64_t *n_modes, int iflag, int ntr, float eps, cufinufftf_plan *d_plan_ptr,
-                        cufinufft_opts *opts);
+int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, double eps,
+                       cufinufft_plan *d_plan_ptr, cufinufft_opts *opts);
+int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps,
+                        cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);

 int cufinufft_setpts(cufinufft_plan d_plan, int M, double *h_kx, double *h_ky, double *h_kz, int N, double *h_s,
                      double *h_t, double *h_u);
diff --git a/include/cufinufft/contrib/cuda_samples/helper_cuda.h b/include/cufinufft/contrib/cuda_samples/helper_cuda.h
deleted file mode 100644
index 5378bfc11..000000000
--- a/include/cufinufft/contrib/cuda_samples/helper_cuda.h
+++ /dev/null
@@ -1,906 +0,0 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *  * Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- *  * Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *  * Neither the name of NVIDIA CORPORATION nor the names of its
- *    contributors may be used to endorse or promote products derived
- *    from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-////////////////////////////////////////////////////////////////////////////////
-// These are CUDA Helper functions for initialization and error checking
-
-#ifndef COMMON_HELPER_CUDA_H_
-#define COMMON_HELPER_CUDA_H_
-
-#pragma once
-
-#include <stdint.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <helper_string.h>
-
-#ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
-#endif
-
-// Note, it is required that your SDK sample to include the proper header
-// files, please refer the CUDA examples for examples of the needed CUDA
-// headers, which may change depending on which CUDA functions are used.
- -// CUDA Runtime error messages -#ifdef __DRIVER_TYPES_H__ -static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } -#endif - -#ifdef CUDA_DRIVER_API -// CUDA Driver API errors -static const char *_cudaGetErrorEnum(CUresult error) { - static char unknown[] = ""; - const char *ret = NULL; - cuGetErrorName(error, &ret); - return ret ? ret : unknown; -} -#endif - -#ifdef CUBLAS_API_H_ -// cuBLAS API errors -static const char *_cudaGetErrorEnum(cublasStatus_t error) { - switch (error) { - case CUBLAS_STATUS_SUCCESS: - return "CUBLAS_STATUS_SUCCESS"; - - case CUBLAS_STATUS_NOT_INITIALIZED: - return "CUBLAS_STATUS_NOT_INITIALIZED"; - - case CUBLAS_STATUS_ALLOC_FAILED: - return "CUBLAS_STATUS_ALLOC_FAILED"; - - case CUBLAS_STATUS_INVALID_VALUE: - return "CUBLAS_STATUS_INVALID_VALUE"; - - case CUBLAS_STATUS_ARCH_MISMATCH: - return "CUBLAS_STATUS_ARCH_MISMATCH"; - - case CUBLAS_STATUS_MAPPING_ERROR: - return "CUBLAS_STATUS_MAPPING_ERROR"; - - case CUBLAS_STATUS_EXECUTION_FAILED: - return "CUBLAS_STATUS_EXECUTION_FAILED"; - - case CUBLAS_STATUS_INTERNAL_ERROR: - return "CUBLAS_STATUS_INTERNAL_ERROR"; - - case CUBLAS_STATUS_NOT_SUPPORTED: - return "CUBLAS_STATUS_NOT_SUPPORTED"; - - case CUBLAS_STATUS_LICENSE_ERROR: - return "CUBLAS_STATUS_LICENSE_ERROR"; - } - - return ""; -} -#endif - -#ifdef _CUFFT_H_ -// cuFFT API errors -static const char *_cudaGetErrorEnum(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; - - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; - - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CUSPARSEAPI -// cuSPARSE API errors -static const char *_cudaGetErrorEnum(cusparseStatus_t error) { - switch (error) { - case CUSPARSE_STATUS_SUCCESS: - return "CUSPARSE_STATUS_SUCCESS"; - - case CUSPARSE_STATUS_NOT_INITIALIZED: - return "CUSPARSE_STATUS_NOT_INITIALIZED"; - - case CUSPARSE_STATUS_ALLOC_FAILED: - return "CUSPARSE_STATUS_ALLOC_FAILED"; - - case CUSPARSE_STATUS_INVALID_VALUE: - return "CUSPARSE_STATUS_INVALID_VALUE"; - - case CUSPARSE_STATUS_ARCH_MISMATCH: - return "CUSPARSE_STATUS_ARCH_MISMATCH"; - - case CUSPARSE_STATUS_MAPPING_ERROR: - return "CUSPARSE_STATUS_MAPPING_ERROR"; - - case CUSPARSE_STATUS_EXECUTION_FAILED: - return "CUSPARSE_STATUS_EXECUTION_FAILED"; - - case CUSPARSE_STATUS_INTERNAL_ERROR: - return "CUSPARSE_STATUS_INTERNAL_ERROR"; - - case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - } - - return ""; -} -#endif - -#ifdef CUSOLVER_COMMON_H_ -// cuSOLVER API errors -static const char 
*_cudaGetErrorEnum(cusolverStatus_t error) { - switch (error) { - case CUSOLVER_STATUS_SUCCESS: - return "CUSOLVER_STATUS_SUCCESS"; - case CUSOLVER_STATUS_NOT_INITIALIZED: - return "CUSOLVER_STATUS_NOT_INITIALIZED"; - case CUSOLVER_STATUS_ALLOC_FAILED: - return "CUSOLVER_STATUS_ALLOC_FAILED"; - case CUSOLVER_STATUS_INVALID_VALUE: - return "CUSOLVER_STATUS_INVALID_VALUE"; - case CUSOLVER_STATUS_ARCH_MISMATCH: - return "CUSOLVER_STATUS_ARCH_MISMATCH"; - case CUSOLVER_STATUS_MAPPING_ERROR: - return "CUSOLVER_STATUS_MAPPING_ERROR"; - case CUSOLVER_STATUS_EXECUTION_FAILED: - return "CUSOLVER_STATUS_EXECUTION_FAILED"; - case CUSOLVER_STATUS_INTERNAL_ERROR: - return "CUSOLVER_STATUS_INTERNAL_ERROR"; - case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; - case CUSOLVER_STATUS_NOT_SUPPORTED: - return "CUSOLVER_STATUS_NOT_SUPPORTED "; - case CUSOLVER_STATUS_ZERO_PIVOT: - return "CUSOLVER_STATUS_ZERO_PIVOT"; - case CUSOLVER_STATUS_INVALID_LICENSE: - return "CUSOLVER_STATUS_INVALID_LICENSE"; - } - - return ""; -} -#endif - -#ifdef CURAND_H_ -// cuRAND API errors -static const char *_cudaGetErrorEnum(curandStatus_t error) { - switch (error) { - case CURAND_STATUS_SUCCESS: - return "CURAND_STATUS_SUCCESS"; - - case CURAND_STATUS_VERSION_MISMATCH: - return "CURAND_STATUS_VERSION_MISMATCH"; - - case CURAND_STATUS_NOT_INITIALIZED: - return "CURAND_STATUS_NOT_INITIALIZED"; - - case CURAND_STATUS_ALLOCATION_FAILED: - return "CURAND_STATUS_ALLOCATION_FAILED"; - - case CURAND_STATUS_TYPE_ERROR: - return "CURAND_STATUS_TYPE_ERROR"; - - case CURAND_STATUS_OUT_OF_RANGE: - return "CURAND_STATUS_OUT_OF_RANGE"; - - case CURAND_STATUS_LENGTH_NOT_MULTIPLE: - return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; - - case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: - return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; - - case CURAND_STATUS_LAUNCH_FAILURE: - return "CURAND_STATUS_LAUNCH_FAILURE"; - - case CURAND_STATUS_PREEXISTING_FAILURE: - return "CURAND_STATUS_PREEXISTING_FAILURE"; - - case CURAND_STATUS_INITIALIZATION_FAILED: - return "CURAND_STATUS_INITIALIZATION_FAILED"; - - case CURAND_STATUS_ARCH_MISMATCH: - return "CURAND_STATUS_ARCH_MISMATCH"; - - case CURAND_STATUS_INTERNAL_ERROR: - return "CURAND_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NVJPEGAPI -// nvJPEG API errors -static const char *_cudaGetErrorEnum(nvjpegStatus_t error) { - switch (error) { - case NVJPEG_STATUS_SUCCESS: - return "NVJPEG_STATUS_SUCCESS"; - - case NVJPEG_STATUS_NOT_INITIALIZED: - return "NVJPEG_STATUS_NOT_INITIALIZED"; - - case NVJPEG_STATUS_INVALID_PARAMETER: - return "NVJPEG_STATUS_INVALID_PARAMETER"; - - case NVJPEG_STATUS_BAD_JPEG: - return "NVJPEG_STATUS_BAD_JPEG"; - - case NVJPEG_STATUS_JPEG_NOT_SUPPORTED: - return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED"; - - case NVJPEG_STATUS_ALLOCATOR_FAILURE: - return "NVJPEG_STATUS_ALLOCATOR_FAILURE"; - - case NVJPEG_STATUS_EXECUTION_FAILED: - return "NVJPEG_STATUS_EXECUTION_FAILED"; - - case NVJPEG_STATUS_ARCH_MISMATCH: - return "NVJPEG_STATUS_ARCH_MISMATCH"; - - case NVJPEG_STATUS_INTERNAL_ERROR: - return "NVJPEG_STATUS_INTERNAL_ERROR"; - } - - return ""; -} -#endif - -#ifdef NV_NPPIDEFS_H -// NPP API errors -static const char *_cudaGetErrorEnum(NppStatus error) { - switch (error) { - case NPP_NOT_SUPPORTED_MODE_ERROR: - return "NPP_NOT_SUPPORTED_MODE_ERROR"; - - case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_RESIZE_NO_OPERATION_ERROR: - return "NPP_RESIZE_NO_OPERATION_ERROR"; - - case 
NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY: - return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_BAD_ARG_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFF_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECT_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUAD_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEM_ALLOC_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTO_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_INPUT: - return "NPP_INVALID_INPUT"; - - case NPP_POINTER_ERROR: - return "NPP_POINTER_ERROR"; - - case NPP_WARNING: - return "NPP_WARNING"; - - case NPP_ODD_ROI_WARNING: - return "NPP_ODD_ROI_WARNING"; -#else - - // These are for CUDA 5.5 or higher - case NPP_BAD_ARGUMENT_ERROR: - return "NPP_BAD_ARGUMENT_ERROR"; - - case NPP_COEFFICIENT_ERROR: - return "NPP_COEFFICIENT_ERROR"; - - case NPP_RECTANGLE_ERROR: - return "NPP_RECTANGLE_ERROR"; - - case NPP_QUADRANGLE_ERROR: - return "NPP_QUADRANGLE_ERROR"; - - case NPP_MEMORY_ALLOCATION_ERR: - return "NPP_MEMORY_ALLOCATION_ERROR"; - - case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR: - return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR"; - - case NPP_INVALID_HOST_POINTER_ERROR: - return "NPP_INVALID_HOST_POINTER_ERROR"; - - case NPP_INVALID_DEVICE_POINTER_ERROR: - return "NPP_INVALID_DEVICE_POINTER_ERROR"; -#endif - - case NPP_LUT_NUMBER_OF_LEVELS_ERROR: - return "NPP_LUT_NUMBER_OF_LEVELS_ERROR"; - - case NPP_TEXTURE_BIND_ERROR: - return "NPP_TEXTURE_BIND_ERROR"; - - case NPP_WRONG_INTERSECTION_ROI_ERROR: - return "NPP_WRONG_INTERSECTION_ROI_ERROR"; - - case NPP_NOT_EVEN_STEP_ERROR: - return "NPP_NOT_EVEN_STEP_ERROR"; - - case NPP_INTERPOLATION_ERROR: - return "NPP_INTERPOLATION_ERROR"; - - case NPP_RESIZE_FACTOR_ERROR: - return "NPP_RESIZE_FACTOR_ERROR"; - - case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR: - return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000 - - case NPP_MEMFREE_ERR: - return "NPP_MEMFREE_ERR"; - - case NPP_MEMSET_ERR: - return "NPP_MEMSET_ERR"; - - case NPP_MEMCPY_ERR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERR: - return "NPP_MIRROR_FLIP_ERR"; -#else - - case NPP_MEMFREE_ERROR: - return "NPP_MEMFREE_ERROR"; - - case NPP_MEMSET_ERROR: - return "NPP_MEMSET_ERROR"; - - case NPP_MEMCPY_ERROR: - return "NPP_MEMCPY_ERROR"; - - case NPP_MIRROR_FLIP_ERROR: - return "NPP_MIRROR_FLIP_ERROR"; -#endif - - case NPP_ALIGNMENT_ERROR: - return "NPP_ALIGNMENT_ERROR"; - - case NPP_STEP_ERROR: - return "NPP_STEP_ERROR"; - - case NPP_SIZE_ERROR: - return "NPP_SIZE_ERROR"; - - case NPP_NULL_POINTER_ERROR: - return "NPP_NULL_POINTER_ERROR"; - - case NPP_CUDA_KERNEL_EXECUTION_ERROR: - return "NPP_CUDA_KERNEL_EXECUTION_ERROR"; - - case NPP_NOT_IMPLEMENTED_ERROR: - return "NPP_NOT_IMPLEMENTED_ERROR"; - - case NPP_ERROR: - return "NPP_ERROR"; - - case NPP_SUCCESS: - return "NPP_SUCCESS"; - - case NPP_WRONG_INTERSECTION_QUAD_WARNING: - return "NPP_WRONG_INTERSECTION_QUAD_WARNING"; - - case NPP_MISALIGNED_DST_ROI_WARNING: - return "NPP_MISALIGNED_DST_ROI_WARNING"; - - case NPP_AFFINE_QUAD_INCORRECT_WARNING: - return "NPP_AFFINE_QUAD_INCORRECT_WARNING"; - - case NPP_DOUBLE_SIZE_WARNING: - return "NPP_DOUBLE_SIZE_WARNING"; - - case NPP_WRONG_INTERSECTION_ROI_WARNING: - return "NPP_WRONG_INTERSECTION_ROI_WARNING"; - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000 - /* These are 6.0 or higher */ - 
case NPP_LUT_PALETTE_BITSIZE_ERROR: - return "NPP_LUT_PALETTE_BITSIZE_ERROR"; - - case NPP_ZC_MODE_NOT_SUPPORTED_ERROR: - return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR"; - - case NPP_QUALITY_INDEX_ERROR: - return "NPP_QUALITY_INDEX_ERROR"; - - case NPP_CHANNEL_ORDER_ERROR: - return "NPP_CHANNEL_ORDER_ERROR"; - - case NPP_ZERO_MASK_VALUE_ERROR: - return "NPP_ZERO_MASK_VALUE_ERROR"; - - case NPP_NUMBER_OF_CHANNELS_ERROR: - return "NPP_NUMBER_OF_CHANNELS_ERROR"; - - case NPP_COI_ERROR: - return "NPP_COI_ERROR"; - - case NPP_DIVISOR_ERROR: - return "NPP_DIVISOR_ERROR"; - - case NPP_CHANNEL_ERROR: - return "NPP_CHANNEL_ERROR"; - - case NPP_STRIDE_ERROR: - return "NPP_STRIDE_ERROR"; - - case NPP_ANCHOR_ERROR: - return "NPP_ANCHOR_ERROR"; - - case NPP_MASK_SIZE_ERROR: - return "NPP_MASK_SIZE_ERROR"; - - case NPP_MOMENT_00_ZERO_ERROR: - return "NPP_MOMENT_00_ZERO_ERROR"; - - case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR: - return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR"; - - case NPP_THRESHOLD_ERROR: - return "NPP_THRESHOLD_ERROR"; - - case NPP_CONTEXT_MATCH_ERROR: - return "NPP_CONTEXT_MATCH_ERROR"; - - case NPP_FFT_FLAG_ERROR: - return "NPP_FFT_FLAG_ERROR"; - - case NPP_FFT_ORDER_ERROR: - return "NPP_FFT_ORDER_ERROR"; - - case NPP_SCALE_RANGE_ERROR: - return "NPP_SCALE_RANGE_ERROR"; - - case NPP_DATA_TYPE_ERROR: - return "NPP_DATA_TYPE_ERROR"; - - case NPP_OUT_OFF_RANGE_ERROR: - return "NPP_OUT_OFF_RANGE_ERROR"; - - case NPP_DIVIDE_BY_ZERO_ERROR: - return "NPP_DIVIDE_BY_ZERO_ERROR"; - - case NPP_RANGE_ERROR: - return "NPP_RANGE_ERROR"; - - case NPP_NO_MEMORY_ERROR: - return "NPP_NO_MEMORY_ERROR"; - - case NPP_ERROR_RESERVED: - return "NPP_ERROR_RESERVED"; - - case NPP_NO_OPERATION_WARNING: - return "NPP_NO_OPERATION_WARNING"; - - case NPP_DIVIDE_BY_ZERO_WARNING: - return "NPP_DIVIDE_BY_ZERO_WARNING"; -#endif - -#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000 - /* These are 7.0 or higher */ - case NPP_OVERFLOW_ERROR: - return "NPP_OVERFLOW_ERROR"; - - case NPP_CORRUPTED_DATA_ERROR: - return "NPP_CORRUPTED_DATA_ERROR"; -#endif - } - - return ""; -} -#endif - -template -void check(T result, char const *const func, const char *const file, int const line) { - if (result) { - fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), - _cudaGetErrorEnum(result), func); - exit(EXIT_FAILURE); - } -} - -#ifdef __DRIVER_TYPES_H__ -// This will output the proper CUDA error strings in the event -// that a CUDA host call returns an error -#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) - -// This will output the proper error string when calling cudaGetLastError -#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__) - -inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, - "%s(%i) : getLastCudaError() CUDA error :" - " %s : (%d) %s.\n", - file, line, errorMessage, static_cast(err), cudaGetErrorString(err)); - exit(EXIT_FAILURE); - } -} - -// This will only print the proper error string when calling cudaGetLastError -// but not exit program incase error detected. 
-#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__) - -inline void __printLastCudaError(const char *errorMessage, const char *file, const int line) { - cudaError_t err = cudaGetLastError(); - - if (cudaSuccess != err) { - fprintf(stderr, - "%s(%i) : getLastCudaError() CUDA error :" - " %s : (%d) %s.\n", - file, line, errorMessage, static_cast(err), cudaGetErrorString(err)); - } -} -#endif - -#ifndef MAX -#define MAX(a, b) (a > b ? a : b) -#endif - -// Float To Int conversion -inline int ftoi(float value) { return (value >= 0 ? static_cast(value + 0.5) : static_cast(value - 0.5)); } - -// Beginning of GPU Architecture definitions -inline int _ConvertSMVer2Cores(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine - // the # of cores per SM - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, - // and m = SM minor version - int Cores; - } sSMtoCores; - - sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128}, - {0x52, 128}, {0x53, 128}, {0x60, 64}, {0x61, 128}, {0x62, 128}, - {0x70, 64}, {0x72, 64}, {0x75, 64}, {-1, -1}}; - - int index = 0; - - while (nGpuArchCoresPerSM[index].SM != -1) { - if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) { - return nGpuArchCoresPerSM[index].Cores; - } - - index++; - } - - // If we don't find the values, we default use the previous one - // to run properly - printf("MapSMtoCores for SM %d.%d is undefined." - " Default to use %d Cores/SM\n", - major, minor, nGpuArchCoresPerSM[index - 1].Cores); - return nGpuArchCoresPerSM[index - 1].Cores; -} - -inline const char *_ConvertSMVer2ArchName(int major, int minor) { - // Defines for GPU Architecture types (using the SM version to determine - // the GPU Arch name) - typedef struct { - int SM; // 0xMm (hexidecimal notation), M = SM Major version, - // and m = SM minor version - const char *name; - } sSMtoArchName; - - sSMtoArchName nGpuArchNameSM[] = {{0x30, "Kepler"}, {0x32, "Kepler"}, {0x35, "Kepler"}, {0x37, "Kepler"}, - {0x50, "Maxwell"}, {0x52, "Maxwell"}, {0x53, "Maxwell"}, {0x60, "Pascal"}, - {0x61, "Pascal"}, {0x62, "Pascal"}, {0x70, "Volta"}, {0x72, "Xavier"}, - {0x75, "Turing"}, {-1, "Graphics Device"}}; - - int index = 0; - - while (nGpuArchNameSM[index].SM != -1) { - if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) { - return nGpuArchNameSM[index].name; - } - - index++; - } - - // If we don't find the values, we default use the previous one - // to run properly - printf("MapSMtoArchName for SM %d.%d is undefined." - " Default to use %s\n", - major, minor, nGpuArchNameSM[index - 1].name); - return nGpuArchNameSM[index - 1].name; -} -// end of GPU Architecture definitions - -#ifdef __CUDA_RUNTIME_H__ -// General GPU Device CUDA Initialization -inline int gpuDeviceInit(int devID) { - int device_count; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "gpuDeviceInit() CUDA error: " - "no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - if (devID < 0) { - devID = 0; - } - - if (devID > device_count - 1) { - fprintf(stderr, "\n"); - fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count); - fprintf(stderr, - ">> gpuDeviceInit (-device=%d) is not a valid" - " GPU device. 
<<\n", - devID); - fprintf(stderr, "\n"); - return -devID; - } - - int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - if (computeMode == cudaComputeModeProhibited) { - fprintf(stderr, "Error: device is running in , no threads can use cudaSetDevice().\n"); - return -1; - } - - if (major < 1) { - fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n"); - exit(EXIT_FAILURE); - } - - checkCudaErrors(cudaSetDevice(devID)); - printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor)); - - return devID; -} - -// This function returns the best GPU (with maximum GFLOPS) -inline int gpuGetMaxGflopsDeviceId() { - int current_device = 0, sm_per_multiproc = 0; - int max_perf_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - uint64_t max_compute_perf = 0; - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" - " no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the best CUDA capable GPU device - current_device = 0; - - while (current_device < device_count) { - int computeMode = -1, major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); - - // If this GPU is not running on Compute Mode prohibited, - // then we can add it to the list - if (computeMode != cudaComputeModeProhibited) { - if (major == 9999 && minor == 9999) { - sm_per_multiproc = 1; - } else { - sm_per_multiproc = _ConvertSMVer2Cores(major, minor); - } - int multiProcessorCount = 0, clockRate = 0; - checkCudaErrors( - cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device)); - uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate; - - if (compute_perf > max_compute_perf) { - max_compute_perf = compute_perf; - max_perf_device = current_device; - } - } else { - devices_prohibited++; - } - - ++current_device; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:" - " all devices have compute mode prohibited.\n"); - exit(EXIT_FAILURE); - } - - return max_perf_device; -} - -// Initialization code to find the best CUDA Device -inline int findCudaDevice(int argc, const char **argv) { - int devID = 0; - - // If the command-line has a device number specified, use it - if (checkCmdLineFlag(argc, argv, "device")) { - devID = getCmdLineArgumentInt(argc, argv, "device="); - - if (devID < 0) { - printf("Invalid command line parameter\n "); - exit(EXIT_FAILURE); - } else { - devID = gpuDeviceInit(devID); - - if (devID < 0) { - printf("exiting...\n"); - exit(EXIT_FAILURE); - } - } - } else { - // Otherwise pick the device with highest Gflops/s - devID = gpuGetMaxGflopsDeviceId(); - checkCudaErrors(cudaSetDevice(devID)); - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID)); - 
checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, _ConvertSMVer2ArchName(major, minor), - major, minor); - } - - return devID; -} - -inline int findIntegratedGPU() { - int current_device = 0; - int device_count = 0; - int devices_prohibited = 0; - - checkCudaErrors(cudaGetDeviceCount(&device_count)); - - if (device_count == 0) { - fprintf(stderr, "CUDA error: no devices supporting CUDA.\n"); - exit(EXIT_FAILURE); - } - - // Find the integrated GPU which is compute capable - while (current_device < device_count) { - int computeMode = -1, integrated = -1; - checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device)); - // If GPU is integrated and is not running on Compute Mode prohibited, - // then cuda can map to GLES resource - if (integrated && (computeMode != cudaComputeModeProhibited)) { - checkCudaErrors(cudaSetDevice(current_device)); - - int major = 0, minor = 0; - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device)); - printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device, - _ConvertSMVer2ArchName(major, minor), major, minor); - - return current_device; - } else { - devices_prohibited++; - } - - current_device++; - } - - if (devices_prohibited == device_count) { - fprintf(stderr, "CUDA error:" - " No GLES-CUDA Interop capable GPU found.\n"); - exit(EXIT_FAILURE); - } - - return -1; -} - -// General check for CUDA GPU SM Capabilities -inline bool checkCudaCapabilities(int major_version, int minor_version) { - int dev; - int major = 0, minor = 0; - - checkCudaErrors(cudaGetDevice(&dev)); - checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev)); - checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev)); - - if ((major > major_version) || (major == major_version && minor >= minor_version)) { - printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, _ConvertSMVer2ArchName(major, minor), major, - minor); - return true; - } else { - printf(" No GPU device was found that can support " - "CUDA compute capability %d.%d.\n", - major_version, minor_version); - return false; - } -} -#endif - -// end of CUDA Helper Functions - -#endif // COMMON_HELPER_CUDA_H_ diff --git a/include/cufinufft/contrib/cuda_samples/helper_string.h b/include/cufinufft/contrib/cuda_samples/helper_string.h deleted file mode 100644 index 9dfaa28d6..000000000 --- a/include/cufinufft/contrib/cuda_samples/helper_string.h +++ /dev/null @@ -1,357 +0,0 @@ -/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. 
- * * Neither the name of NVIDIA CORPORATION nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY - * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR - * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, - * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, - * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR - * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY - * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -// These are helper functions for the SDK samples (string parsing, timers, etc) -#ifndef COMMON_HELPER_STRING_H_ -#define COMMON_HELPER_STRING_H_ - -#include -#include -#include -#include - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) -#ifndef _CRT_SECURE_NO_DEPRECATE -#define _CRT_SECURE_NO_DEPRECATE -#endif -#ifndef STRCASECMP -#define STRCASECMP _stricmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP _strnicmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result != 0) -#endif -#ifndef SSCANF -#define SSCANF sscanf_s -#endif -#ifndef SPRINTF -#define SPRINTF sprintf_s -#endif -#else // Linux Includes -#include -#include - -#ifndef STRCASECMP -#define STRCASECMP strcasecmp -#endif -#ifndef STRNCASECMP -#define STRNCASECMP strncasecmp -#endif -#ifndef STRCPY -#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath) -#endif - -#ifndef FOPEN -#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode)) -#endif -#ifndef FOPEN_FAIL -#define FOPEN_FAIL(result) (result == NULL) -#endif -#ifndef SSCANF -#define SSCANF sscanf -#endif -#ifndef SPRINTF -#define SPRINTF sprintf -#endif -#endif - -#ifndef EXIT_WAIVED -#define EXIT_WAIVED 2 -#endif - -// CUDA Utility Helper Functions -inline int stringRemoveDelimiter(char delimiter, const char *string) { - int string_start = 0; - - while (string[string_start] == delimiter) { - string_start++; - } - - if (string_start >= static_cast(strlen(string) - 1)) { - return 0; - } - - return string_start; -} - -inline int getFileExtension(char *filename, char **extension) { - int string_length = static_cast(strlen(filename)); - - while (filename[string_length--] != '.') { - if (string_length == 0) - break; - } - - if (string_length > 0) - string_length += 2; - - if (string_length == 0) - *extension = NULL; - else - *extension = &filename[string_length]; - - return string_length; -} - -inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - - const char *equal_pos = strchr(string_argv, '='); - int argv_length = static_cast(equal_pos == 0 ? 
strlen(string_argv) : equal_pos - string_argv); - - int length = static_cast(strlen(string_ref)); - - if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) { - bFound = true; - continue; - } - } - } - - return bFound; -} - -// This function wraps the CUDA Driver API into a template function -template -inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - *value = (T)atoi(&string_argv[length + auto_inc]); - } - - bFound = true; - i = argc; - } - } - } - - return bFound; -} - -inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) { - bool bFound = false; - int value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = atoi(&string_argv[length + auto_inc]); - } else { - value = 0; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) { - bool bFound = false; - float value = -1; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - const char *string_argv = &argv[i][string_start]; - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - if (length + 1 <= static_cast(strlen(string_argv))) { - int auto_inc = (string_argv[length] == '=') ? 1 : 0; - value = static_cast(atof(&string_argv[length + auto_inc])); - } else { - value = 0.f; - } - - bFound = true; - continue; - } - } - } - - if (bFound) { - return value; - } else { - return 0; - } -} - -inline bool getCmdLineArgumentString(const int argc, const char **argv, const char *string_ref, char **string_retval) { - bool bFound = false; - - if (argc >= 1) { - for (int i = 1; i < argc; i++) { - int string_start = stringRemoveDelimiter('-', argv[i]); - char *string_argv = const_cast(&argv[i][string_start]); - int length = static_cast(strlen(string_ref)); - - if (!STRNCASECMP(string_argv, string_ref, length)) { - *string_retval = &string_argv[length + 1]; - bFound = true; - continue; - } - } - } - - if (!bFound) { - *string_retval = NULL; - } - - return bFound; -} - -////////////////////////////////////////////////////////////////////////////// -//! Find the path for a file assuming that -//! files are found in the searchPath. -//! -//! @return the path if succeeded, otherwise 0 -//! @param filename name of the file -//! 
@param executable_path optional absolute path of the executable -////////////////////////////////////////////////////////////////////////////// -inline char *sdkFindFilePath(const char *filename, const char *executable_path) { - // defines a variable that is replaced with the name of the - // executable - - // Typical relative search paths to locate needed companion files (e.g. sample - // input data, or JIT source files) The origin for the relative search may be - // the .exe file, a .bat file launching an .exe, a browser .exe launching the - // .exe or .bat, etc - const char *searchPath[] = { - "./", // same dir - "./data/", // same dir - "../../../../Samples//", // up 4 in tree - "../../../Samples//", // up 3 in tree - "../../Samples//", // up 2 in tree - "../../../../Samples//data/", // up 4 in tree - "../../../Samples//data/", // up 3 in tree - "../../Samples//data/", // up 2 in tree - }; - - // Extract the executable name - std::string executable_name; - - if (executable_path != 0) { - executable_name = std::string(executable_path); - -#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) - // Windows path delimiter - size_t delimiter_pos = executable_name.find_last_of('\\'); - executable_name.erase(0, delimiter_pos + 1); - - if (executable_name.rfind(".exe") != std::string::npos) { - // we strip .exe, only if the .exe is found - executable_name.resize(executable_name.size() - 4); - } - -#else - // Linux & OSX path delimiter - size_t delimiter_pos = executable_name.find_last_of('/'); - executable_name.erase(0, delimiter_pos + 1); -#endif - } - - // Loop over all search paths and return the first hit - for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) { - std::string path(searchPath[i]); - size_t executable_name_pos = path.find(""); - - // If there is executable_name variable in the searchPath - // replace it with the value - if (executable_name_pos != std::string::npos) { - if (executable_path != 0) { - path.replace(executable_name_pos, strlen(""), executable_name); - } else { - // Skip this path entry if no executable argument is given - continue; - } - } - -#ifdef _DEBUG - printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str()); -#endif - - // Test if the file exists - path.append(filename); - FILE *fp; - FOPEN(fp, path.c_str(), "rb"); - - if (fp != NULL) { - fclose(fp); - // File found - // returning an allocated array here for backwards compatibility reasons - char *file_path = reinterpret_cast(malloc(path.length() + 1)); - STRCPY(file_path, path.length() + 1, path.c_str()); - return file_path; - } - - if (fp) { - fclose(fp); - } - } - - // File not found - return 0; -} - -#endif // COMMON_HELPER_STRING_H_ diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h new file mode 100644 index 000000000..fe4010393 --- /dev/null +++ b/include/cufinufft/contrib/helper_cuda.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of NVIDIA CORPORATION nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY + * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMMON_HELPER_CUDA_H_ +#define COMMON_HELPER_CUDA_H_ + +#include +#include +#include +#include + +#include + +#include + +static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); } + +// This will output the proper CUDA error strings in the event +// that a CUDA host call returns an error +#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__) + +#define RETURN_IF_CUDA_ERROR \ + { \ + cudaError_t err = cudaGetLastError(); \ + if (err != cudaSuccess) { \ + printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \ + return FINUFFT_ERR_CUDA_FAILURE; \ + } \ + } + +#define CUDA_FREE_AND_NULL(val) \ + { \ + check(cudaFree(val), #val, __FILE__, __LINE__); \ + val = nullptr; \ + } + +static const char *cufftGetErrorString(cufftResult error) { + switch (error) { + case CUFFT_SUCCESS: + return "CUFFT_SUCCESS"; + + case CUFFT_INVALID_PLAN: + return "CUFFT_INVALID_PLAN"; + + case CUFFT_ALLOC_FAILED: + return "CUFFT_ALLOC_FAILED"; + + case CUFFT_INVALID_TYPE: + return "CUFFT_INVALID_TYPE"; + + case CUFFT_INVALID_VALUE: + return "CUFFT_INVALID_VALUE"; + + case CUFFT_INTERNAL_ERROR: + return "CUFFT_INTERNAL_ERROR"; + + case CUFFT_EXEC_FAILED: + return "CUFFT_EXEC_FAILED"; + + case CUFFT_SETUP_FAILED: + return "CUFFT_SETUP_FAILED"; + + case CUFFT_INVALID_SIZE: + return "CUFFT_INVALID_SIZE"; + + case CUFFT_UNALIGNED_DATA: + return "CUFFT_UNALIGNED_DATA"; + + case CUFFT_INCOMPLETE_PARAMETER_LIST: + return "CUFFT_INCOMPLETE_PARAMETER_LIST"; + + case CUFFT_INVALID_DEVICE: + return "CUFFT_INVALID_DEVICE"; + + case CUFFT_PARSE_ERROR: + return "CUFFT_PARSE_ERROR"; + + case CUFFT_NO_WORKSPACE: + return "CUFFT_NO_WORKSPACE"; + + case CUFFT_NOT_IMPLEMENTED: + return "CUFFT_NOT_IMPLEMENTED"; + + case CUFFT_LICENSE_ERROR: + return "CUFFT_LICENSE_ERROR"; + + case CUFFT_NOT_SUPPORTED: + return "CUFFT_NOT_SUPPORTED"; + } + + return ""; +} + +template +int check(T result, char const *const func, const char *const file, int const line) { + if (result) { + fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result), + _cudaGetErrorEnum(result), func); + return FINUFFT_ERR_CUDA_FAILURE; + } + + return 0; +} + +#endif // COMMON_HELPER_CUDA_H_ diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h index 64b237759..6cdb84340 100644 --- a/include/cufinufft/defs.h +++ b/include/cufinufft/defs.h @@ -13,17 +13,6 @@ // FIXME: If cufft ever takes N > INT_MAX... 
constexpr int32_t MAX_NF = std::numeric_limits<int32_t>::max();

-// Global error codes for the library...
-#define WARN_EPS_TOO_SMALL 1
-#define ERR_MAXNALLOC 2
-#define ERR_SPREAD_BOX_SMALL 3
-#define ERR_SPREAD_PTS_OUT_RANGE 4
-#define ERR_SPREAD_ALLOC 5
-#define ERR_SPREAD_DIR 6
-#define ERR_UPSAMPFAC_TOO_SMALL 7
-#define HORNER_WRONG_BETA 8
-#define ERR_NDATA_NOTVALID 9
-
 // allow compile-time switch off of openmp, so compilation without any openmp
 // is done (Note: _OPENMP is automatically set by -fopenmp compile flag)
 #ifdef _OPENMP
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index c508cb69f..10c459961 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -2,11 +2,10 @@
 #define CUFINUFFT_IMPL_H
 #include
-
-#include
 #include
+#include
 #include
 #include
 #include
@@ -15,6 +14,8 @@
 #include
 #include
+#include
+
 // 1d
 template <typename T>
 int cufinufft1d1_exec(cuda_complex<T> *d_c, cuda_complex<T> *d_fk, cufinufft_plan_t<T> *d_plan);
@@ -88,18 +89,22 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
    Melody Shih 07/25/19. User-facing moved to markdown, Barnett 2/16/21.
 */
-    // Mult-GPU support: set the CUDA Device ID:
-    int orig_gpu_device_id;
-    cudaGetDevice(&orig_gpu_device_id);
-    if (opts == NULL) {
-        // options might not be supplied to this function => assume device
-        // 0 by default
-        cudaSetDevice(0);
-    } else {
-        cudaSetDevice(opts->gpu_device_id);
+    int ier;
+    cuDoubleComplex *d_a = nullptr; // fseries temp data
+    T *d_f = nullptr;               // fseries temp data
+
+    if (type < 1 || type > 2) {
+        fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
+        return FINUFFT_ERR_TYPE_NOTVALID;
+    }
+    if (ntransf < 1) {
+        fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, ntransf);
+        return FINUFFT_ERR_NTRANS_NOTVALID;
     }
-    int ier;
+    // Multi-GPU support: set the CUDA Device ID:
+    const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
+    cufinufft::utils::WithCudaDevice device_swapper(device_id);

     /* allocate the plan structure, assign address to user pointer. */
     cufinufft_plan_t<T> *d_plan = new cufinufft_plan_t<T>;
@@ -109,20 +114,36 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     /* If a user has not supplied their own options, assign defaults for them. */
     if (opts == NULL) { // use default opts
-        ier = cufinufft_default_opts(type, dim, &(d_plan->opts));
-        if (ier != 0) {
-            printf("error: cufinufft_default_opts returned error %d.\n", ier);
-            return ier;
-        }
+        cufinufft_default_opts(&(d_plan->opts));
     } else { // or read from what's passed in
         d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
     }

+    /* Automatically set GPU method. */
+    if (d_plan->opts.gpu_method == 0) {
+        /* For type 1, we default to method 2 (SM) since this is generally faster.
+         * However, in the special case of _double precision_ in _three dimensions_
+         * with more than _three digits of precision_, there is not enough shared
+         * memory for this to work. As a result, we will default to method 1 (GM) in
+         * this special case.
+         *
+         * For type 2, we always default to method 1 (GM).
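+         *
+         * Illustrative examples of this rule (inferred from the branch below,
+         * added as a reading aid): a single-precision 3D type 1 plan at
+         * tol=1e-6 keeps method 2 (SM), while a double-precision 3D type 1
+         * plan at the same tolerance falls back to method 1 (GM).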
*/ + if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3)) + d_plan->opts.gpu_method = 2; + else if (type == 1 && tol < 1e-3) + d_plan->opts.gpu_method = 1; + else if (type == 2) + d_plan->opts.gpu_method = 1; + } + /* Setup Spreader */ using namespace cufinufft::common; - ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts); - if (ier > 1) // proceed if success or warning + // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK + if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; return ier; + } d_plan->dim = dim; d_plan->ms = nmodes[0]; @@ -157,64 +178,82 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran using namespace cufinufft::memtransfer; switch (d_plan->dim) { case 1: { - ier = allocgpumem1d_plan(d_plan); + if ((ier = allocgpumem1d_plan(d_plan))) + goto finalize; } break; case 2: { - ier = allocgpumem2d_plan(d_plan); + if ((ier = allocgpumem2d_plan(d_plan))) + goto finalize; } break; case 3: { - ier = allocgpumem3d_plan(d_plan); + if ((ier = allocgpumem3d_plan(d_plan))) + goto finalize; } break; } cufftHandle fftplan; + cufftResult_t cufft_status; switch (d_plan->dim) { case 1: { int n[] = {(int)nf1}; int inembed[] = {(int)nf1}; - cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(), maxbatchsize); + cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(), + maxbatchsize); } break; case 2: { int n[] = {(int)nf2, (int)nf1}; int inembed[] = {(int)nf2, (int)nf1}; - cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, inembed[0] * inembed[1], - cufft_type(), maxbatchsize); + cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, + inembed[0] * inembed[1], cufft_type(), maxbatchsize); } break; case 3: { int n[] = {(int)nf3, (int)nf2, (int)nf1}; int inembed[] = {(int)nf3, (int)nf2, (int)nf1}; - cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, - inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); + cufft_status = cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1, + inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize); } break; } - d_plan->fftplan = fftplan; - std::complex a[3 * MAX_NQUAD]; - T f[3 * MAX_NQUAD]; - onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); - if (dim > 1) { - onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); + if (cufft_status != CUFFT_SUCCESS) { + fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, cufftGetErrorString(cufft_status)); + ier = FINUFFT_ERR_CUDA_FAILURE; + goto finalize; } - if (dim > 2) { - onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts); + d_plan->fftplan = fftplan; + { + std::complex a[3 * MAX_NQUAD]; + T f[3 * MAX_NQUAD]; + onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts); + if (dim > 1) + onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts); + if (dim > 2) + onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts); + + if ((ier = checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))))) + goto finalize; + if ((ier = checkCudaErrors( + cudaMemcpy(d_a, a, dim * MAX_NQUAD * 
sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)))) + goto finalize; + if ((ier = checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)))) + goto finalize; + if ((ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, + d_plan->fwkerhalf3, d_plan->spopts.nspread))) + goto finalize; } - cuDoubleComplex *d_a; - T *d_f; - checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex))); - checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T))); - checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice)); - ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2, - d_plan->fwkerhalf3, d_plan->spopts.nspread); - +finalize: cudaFree(d_a); cudaFree(d_f); - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); + + if (ier > 1) { + delete *d_plan_ptr; + *d_plan_ptr = nullptr; + } return ier; } @@ -258,10 +297,7 @@ Notes: the type T means either single or double, matching the Melody Shih 07/25/19; Barnett 2/16/21 moved out docs. */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; @@ -283,6 +319,8 @@ Notes: the type T means either single or double, matching the ier = allocgpumem3d_nupts(d_plan); } break; } + if (ier) + return ier; d_plan->kx = d_kx; if (dim > 1) @@ -293,93 +331,28 @@ Notes: the type T means either single or double, matching the using namespace cufinufft::spreadinterp; switch (d_plan->dim) { case 1: { - if (d_plan->opts.gpu_method == 1) { - ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan); - if (ier != 0) { - printf("error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return 1; - } - } - if (d_plan->opts.gpu_method == 2) { - ier = cuspread1d_subprob_prop(nf1, M, d_plan); - if (ier != 0) { - printf("error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return 1; - } - } + if (d_plan->opts.gpu_method == 1 && (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && (ier = cuspread1d_subprob_prop(nf1, M, d_plan))) + fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); } break; case 2: { - if (d_plan->opts.gpu_method == 1) { - ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan); - if (ier != 0) { - printf("error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return 1; - } - } - if (d_plan->opts.gpu_method == 2) { - ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan); - if (ier != 0) { - printf("error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return 1; - } - } + if (d_plan->opts.gpu_method == 1 && (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan))) + 
fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan))) + fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); } break; case 3: { - if (d_plan->opts.gpu_method == 4) { - int ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan); - if (ier != 0) { - printf("error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return ier; - } - } - if (d_plan->opts.gpu_method == 1) { - ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan); - if (ier != 0) { - printf("error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return ier; - } - } - if (d_plan->opts.gpu_method == 2) { - int ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan); - if (ier != 0) { - printf("error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return ier; - } - } + if (d_plan->opts.gpu_method == 1 && (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 2 && (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method); + if (d_plan->opts.gpu_method == 4 && (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan))) + fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method); } break; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - return 0; + return ier; } template @@ -514,11 +487,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff Melody Shih 07/25/19; Barnett 2/16/21. */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); - + cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); int ier; int type = d_plan->type; switch (d_plan->dim) { @@ -529,7 +498,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff ier = cufinufft1d2_exec(d_c, d_fk, d_plan); if (type == 3) { std::cerr << "Not Implemented yet" << std::endl; - ier = 1; + ier = FINUFFT_ERR_TYPE_NOTVALID; } } break; case 2: { @@ -539,7 +508,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff ier = cufinufft2d2_exec(d_c, d_fk, d_plan); if (type == 3) { std::cerr << "Not Implemented yet" << std::endl; - ier = 1; + ier = FINUFFT_ERR_TYPE_NOTVALID; } } break; case 3: { @@ -549,14 +518,11 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff ier = cufinufft3d2_exec(d_c, d_fk, d_plan); if (type == 3) { std::cerr << "Not Implemented yet" << std::endl; - ier = 1; + ier = FINUFFT_ERR_TYPE_NOTVALID; } } break; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - return ier; } @@ -572,41 +538,21 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan) Also see ../docs/cppdoc.md for main user-facing documentation. 
*/
{
-    // Mult-GPU support: set the CUDA Device ID:
-    int orig_gpu_device_id;
-    cudaGetDevice(&orig_gpu_device_id);
-    cudaSetDevice(d_plan->opts.gpu_device_id);
-
-    // Can't destroy a Null pointer.
-    if (!d_plan) {
-        // Multi-GPU support: reset the device ID
-        cudaSetDevice(orig_gpu_device_id);
-        return 1;
-    }
+    // Can't destroy a null pointer (check before touching d_plan->opts).
+    if (!d_plan)
+        return FINUFFT_ERR_PLAN_NOTVALID;
+
+    cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);

     if (d_plan->fftplan)
         cufftDestroy(d_plan->fftplan);

     using namespace cufinufft::memtransfer;
-    switch (d_plan->dim) {
-    case 1: {
-        freegpumemory1d<T>(d_plan);
-    } break;
-    case 2: {
-        freegpumemory2d<T>(d_plan);
-    } break;
-    case 3: {
-        freegpumemory3d<T>(d_plan);
-    } break;
-    }
+    freegpumemory<T>(d_plan);

     /* free/destruct the plan */
     delete d_plan;

-    /* set pointer to NULL now that we've hopefully free'd the memory. */
-    d_plan = NULL;
-    // Multi-GPU support: reset the device ID
-    cudaSetDevice(orig_gpu_device_id);
     return 0;
} // namespace cufinufft

#endif
diff --git a/include/cufinufft/memtransfer.h b/include/cufinufft/memtransfer.h
index d9a229ae0..382f911e9 100644
--- a/include/cufinufft/memtransfer.h
+++ b/include/cufinufft/memtransfer.h
@@ -11,19 +11,15 @@ int allocgpumem1d_plan(cufinufft_plan_t<T> *d_plan);
 template <typename T>
 int allocgpumem1d_nupts(cufinufft_plan_t<T> *d_plan);
 template <typename T>
-void freegpumemory1d(cufinufft_plan_t<T> *d_plan);
+void freegpumemory(cufinufft_plan_t<T> *d_plan);
 template <typename T>
 int allocgpumem2d_plan(cufinufft_plan_t<T> *d_plan);
 template <typename T>
 int allocgpumem2d_nupts(cufinufft_plan_t<T> *d_plan);
 template <typename T>
-void freegpumemory2d(cufinufft_plan_t<T> *d_plan);
-template <typename T>
 int allocgpumem3d_plan(cufinufft_plan_t<T> *d_plan);
 template <typename T>
 int allocgpumem3d_nupts(cufinufft_plan_t<T> *d_plan);
-template <typename T>
-void freegpumemory3d(cufinufft_plan_t<T> *d_plan);
 } // namespace memtransfer
 } // namespace cufinufft
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 6f46e750a..fea996a9a 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -26,7 +26,6 @@ struct cuda_complex_impl {
 template <typename T>
 using cuda_complex = typename cuda_complex_impl<T>::type;
-
 template <typename T>
 struct cufinufft_plan_t {
     cufinufft_opts opts;
@@ -46,7 +45,6 @@ struct cufinufft_plan_t {
     int iflag;

     int totalnumsubprob;
-    int byte_now;
     T *fwkerhalf1;
     T *fwkerhalf2;
     T *fwkerhalf3;
@@ -72,10 +70,8 @@ struct cufinufft_plan_t {
     int *subprob_to_nupts;

     cufftHandle fftplan;
-    cudaStream_t *streams;
 };
-
 template <typename T>
 static cufftType_t cufft_type();
 template <>
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 35c70428d..e8deb42e9 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -8,6 +8,8 @@
 #include
 #include
+#include
+
 #include

 #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -30,6 +32,18 @@ __inline__ __device__ double atomicAdd(double *address, double val) {
 namespace cufinufft {
 namespace utils {
+class WithCudaDevice {
+  public:
+    WithCudaDevice(int device) {
+        cudaGetDevice(&orig_device_);
+        cudaSetDevice(device);
+    }
+
+    ~WithCudaDevice() { cudaSetDevice(orig_device_); }
+
+  private:
+    int orig_device_;
+};

 // jfm timer class
 class CNTime {
diff --git a/include/cufinufft_errors.h b/include/cufinufft_errors.h
deleted file mode 100644
index 982ca0c82..000000000
--- a/include/cufinufft_errors.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef __CUFINUFFT_ERRORS_H__
-#define __CUFINUFFT_ERRORS_H__
-#include
-
-// For error checking
-static const char *_cufftGetErrorEnum(cufftResult error) {
-    switch (error) {
-    case
CUFFT_SUCCESS: - return "cufft_success"; - case CUFFT_INVALID_PLAN: - return "cufft_invalid_plan"; - case CUFFT_ALLOC_FAILED: - return "cufft_alloc_failed"; - case CUFFT_INVALID_TYPE: - return "cufft_invalid_type"; - case CUFFT_INVALID_VALUE: - return "cufft_invalid_value"; - case CUFFT_INTERNAL_ERROR: - return "cufft_internal_error"; - case CUFFT_EXEC_FAILED: - return "cufft_exec_failed"; - case CUFFT_SETUP_FAILED: - return "cufft_setup_failed"; - case CUFFT_INVALID_SIZE: - return "cufft_invalid_size"; - case CUFFT_UNALIGNED_DATA: - return "cufft_unaligned data"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "cufft_incomplete_parameter_list"; - case CUFFT_INVALID_DEVICE: - return "cufft_invalid_device"; - case CUFFT_PARSE_ERROR: - return "cufft_parse_error"; - case CUFFT_NO_WORKSPACE: - return "cufft_no_workspace"; - case CUFFT_NOT_IMPLEMENTED: - return "cufft_not_implemented"; - case CUFFT_LICENSE_ERROR: - return "cufft_license_error"; - case CUFFT_NOT_SUPPORTED: - return "cufft_not_supported"; - } - return ""; -} - -#endif diff --git a/include/finufft.h b/include/finufft.h index 61754564c..d0a42ee89 100644 --- a/include/finufft.h +++ b/include/finufft.h @@ -29,6 +29,9 @@ #include #include +// Public error numbers +#include + // octave (mkoctfile) needs this otherwise it doesn't know what int64_t is! #include #define FINUFFT_BIGINT int64_t diff --git a/include/finufft/defs.h b/include/finufft/defs.h index c6be287c9..77bc69b6b 100644 --- a/include/finufft/defs.h +++ b/include/finufft/defs.h @@ -20,27 +20,6 @@ #include -// ---------- Global error/warning output codes for the library --------------- -// (it could be argued these belong in finufft.h, but to avoid polluting -// user's name space we keep them here) -// NB: if change these numbers, also must regen test/results/dumbinputs.refout -#define WARN_EPS_TOO_SMALL 1 -// this means that a fine grid array dim exceeded MAX_NF; no malloc tried... -#define ERR_MAXNALLOC 2 -#define ERR_SPREAD_BOX_SMALL 3 -#define ERR_SPREAD_PTS_OUT_RANGE 4 -#define ERR_SPREAD_ALLOC 5 -#define ERR_SPREAD_DIR 6 -#define ERR_UPSAMPFAC_TOO_SMALL 7 -#define ERR_HORNER_WRONG_BETA 8 -#define ERR_NTRANS_NOTVALID 9 -#define ERR_TYPE_NOTVALID 10 -// some generic internal allocation failure... -#define ERR_ALLOC 11 -#define ERR_DIM_NOTVALID 12 -#define ERR_SPREAD_THREAD_NOTVALID 13 - - // --------------- Private data types for compilation in either prec --------- // Devnote: must match those in relevant prec of public finufft.h interface! 
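The hunk above strips the old private ERR_* codes from finufft/defs.h; they correspond one-to-one to the FINUFFT_-prefixed names in the new public header finufft_errors.h introduced further down. A hypothetical compatibility shim for downstream code (not part of this change) could look like:

/* Hypothetical compatibility shim (not part of this diff): alias two of the
 * removed private macros to their new public names from finufft_errors.h. */
#include <finufft_errors.h>

#define ERR_NTRANS_NOTVALID FINUFFT_ERR_NTRANS_NOTVALID
#define ERR_TYPE_NOTVALID   FINUFFT_ERR_TYPE_NOTVALID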
@@ -63,7 +42,7 @@ // ------------- Library-wide algorithm parameter settings ---------------- // Library version (is a string) -#define FINUFFT_VER "2.1.0" +#define FINUFFT_VER "2.2.0.dev0" // Largest possible kernel spread width per dimension, in fine grid points // (used only in spreadinterp.cpp) diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h index 5d8f5406f..89d86f0de 100644 --- a/include/finufft/fftw_defs.h +++ b/include/finufft/fftw_defs.h @@ -45,10 +45,4 @@ #define FFTW_CLEANUP_THREADS() #endif -#ifdef FFTW_PLAN_SAFE - #define FFTW_PLAN_SF() FFTWIFY(make_planner_thread_safe()) -#else - #define FFTW_PLAN_SF() -#endif - #endif // FFTW_DEFS_H diff --git a/include/finufft_errors.h b/include/finufft_errors.h new file mode 100644 index 000000000..74dc8cc21 --- /dev/null +++ b/include/finufft_errors.h @@ -0,0 +1,29 @@ +#ifndef FINUFFT_ERRORS_H +#define FINUFFT_ERRORS_H + +// ---------- Global error/warning output codes for the library --------------- +// NB: if change these numbers, also must regen test/results/dumbinputs.refout +#define FINUFFT_WARN_EPS_TOO_SMALL 1 +// this means that a fine grid array dim exceeded MAX_NF; no malloc tried... +#define FINUFFT_ERR_MAXNALLOC 2 +#define FINUFFT_ERR_SPREAD_BOX_SMALL 3 +#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4 +#define FINUFFT_ERR_SPREAD_ALLOC 5 +#define FINUFFT_ERR_SPREAD_DIR 6 +#define FINUFFT_ERR_UPSAMPFAC_TOO_SMALL 7 +#define FINUFFT_ERR_HORNER_WRONG_BETA 8 +#define FINUFFT_ERR_NTRANS_NOTVALID 9 +#define FINUFFT_ERR_TYPE_NOTVALID 10 +// some generic internal allocation failure... +#define FINUFFT_ERR_ALLOC 11 +#define FINUFFT_ERR_DIM_NOTVALID 12 +#define FINUFFT_ERR_SPREAD_THREAD_NOTVALID 13 +#define FINUFFT_ERR_NDATA_NOTVALID 14 +// cuda malloc/memset/kernel failure/etc +#define FINUFFT_ERR_CUDA_FAILURE 15 +#define FINUFFT_ERR_PLAN_NOTVALID 16 +#define FINUFFT_ERR_METHOD_NOTVALID 17 +#define FINUFFT_ERR_BINSIZE_NOTVALID 18 +#define FINUFFT_ERR_INSUFFICIENT_SHMEM 19 + +#endif diff --git a/make.inc.macosx_arm64 b/make.inc.macosx_arm64 index 88485755c..8889fb964 100644 --- a/make.inc.macosx_arm64 +++ b/make.inc.macosx_arm64 @@ -23,10 +23,10 @@ CXX=clang++ CC=clang # taken from makefile... -CFLAGS += -I include -I/usr/local/include -I/opt/homebrew/include +CFLAGS += -I include -I/usr/local/include -I/opt/homebrew/include -I/opt/homebrew/opt/libomp/include FFLAGS = $(CFLAGS) CXXFLAGS = $(CFLAGS) -LIBS += -L/usr/local/lib -L/opt/homebrew/lib +LIBS += -L/usr/local/lib -L/opt/homebrew/lib -L/opt/homebrew/opt/libomp/lib # OpenMP with clang needs following... OMPFLAGS = -Xpreprocessor -fopenmp diff --git a/make.inc.macosx_gcc-12 b/make.inc.macosx_gcc-12 new file mode 100644 index 000000000..e274a4d53 --- /dev/null +++ b/make.inc.macosx_gcc-12 @@ -0,0 +1,49 @@ +# Makefile variable overrides for Mac OSX compilation with GCC v.12 +# +# Use this if you'll need to link against gfortran. +# +# Barnett 10/27/18. Input from Yu-Hsuan Shih, Amit Moskovich. +# Lu minor modification for gcc-10 12/06/2020 +# Leslie and Dan F found XCode linking problem, workaround 10/5/23. + +# By default we use clang/LLVM (which is aliased to /usr/lib/gcc, etc). +# This make.inc is if you want to override this. +# Get gcc from brew then use, eg: +CXX=g++-12 +CC=gcc-12 +FC=gfortran + +# as in makefile, but with the brew /usr/local/ stuff... 
+CFLAGS += -I src -I/usr/local/include -I/opt/homebrew/include +FFLAGS = $(CFLAGS) +CXXFLAGS = $(CFLAGS) +LIBS += -L/usr/local/lib -L/opt/homebrew/lib +# Workaround to force old linker to be used due to XCode15 (Issue #360): +LDFLAGS+=-ld64 + +# If you're getting warning messages of the form: +# ld: warning: object file (lib-static/libfinufft.a(finufft1d.o)) was built for +# newer OSX version (10.13) than being linked (10.9) +# Then you can uncomment the following two lines with the older version number +# (in this example -mmacosx-version-min=10.9) +# +#CFLAGS += "-mmacosx-version-min=" + +# OpenMP with GCC on OSX needs following... +OMPFLAGS = -fopenmp +OMPLIBS = -L/usr/local/lib -lgomp +# since fftw3_omp doesn't work in OSX, we need... +FFTWOMPSUFFIX=threads + +# MATLAB interface: +# some of these will depend on your FFTW library location... +MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm +# edit for your MATLAB version location... +MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex +# Also see docs/install.rst for possible edits to MATLAB's MEX XML file. + +# If you have segfault of MATLAB then please try the following: +#MOMPFLAGS = -D_OPENMP +#OMPFLAGS = -Xpreprocessor -fopenmp +#OMPLIBS = $(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/libiomp5.dylib +# This links to MATLAB's omp not gomp. diff --git a/makefile b/makefile index 6bfbe0f90..96a7a596a 100644 --- a/makefile +++ b/makefile @@ -29,8 +29,6 @@ PYTHON = python3 CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range $(CFLAGS) FFLAGS := $(CFLAGS) $(FFLAGS) CXXFLAGS := $(CFLAGS) $(CXXFLAGS) -# put this in your make.inc if you have FFTW>=3.3.5 and want thread-safe use... -#CXXFLAGS += -DFFTW_PLAN_SAFE # FFTW base name, and math linking... FFTWNAME = fftw3 # linux default is fftw3_omp, since 10% faster than fftw3_threads... @@ -192,10 +190,6 @@ endif # examples (C++/C) ----------------------------------------------------------- # build all examples (single-prec codes separate, and not all have one)... EXAMPLES = $(basename $(wildcard examples/*.c examples/*.cpp)) -# ...except only build threadsafe ones if user switch on (thus FFTW>=3.3.6): -ifeq (,$(findstring FFTW_PLAN_SAFE,$(CXXFLAGS))) - EXAMPLES := $(filter-out %/threadsafe1d1 %/threadsafe2d2f, $(EXAMPLES)) -endif examples: $(EXAMPLES) ifneq ($(MINGW),ON) # Windows-MSYS does not find the dynamic libraries, so we make a temporary copy @@ -442,7 +436,7 @@ objclean: ifneq ($(MINGW),ON) # non-Windows-WSL... rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o - rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o + rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o finufft_mod.mod else # Windows-WSL... for /d %%d in (src,test\directfttest,examples,matlab,contrib) do (for %%f in (%%d\*.o) do (del %%f)) diff --git a/matlab/Contents.m b/matlab/Contents.m index 0cd218896..3a336d729 100644 --- a/matlab/Contents.m +++ b/matlab/Contents.m @@ -1,5 +1,5 @@ % FINUFFT: Flatiron Institute Nonuniform Fast Fourier Transform -% Version 2.1.0 +% Version 2.2.0.dev0 % % Basic and many-vector interfaces % finufft1d1 - 1D complex nonuniform FFT of type 1 (nonuniform to uniform). 
diff --git a/matlab/errhandler.m b/matlab/errhandler.m index f52731868..78410edd1 100644 --- a/matlab/errhandler.m +++ b/matlab/errhandler.m @@ -6,7 +6,7 @@ function errhandler(ier) % Note that there are other matlab-only error types defined in valid_*.m switch ier - % These are the ERR_ #defines in ../include/defs.h: + % These are the ERR_ #defines in ../include/finufft_errors.h: case 1 warning('FINUFFT:epsTooSmall','FINUFFT eps tolerance too small to achieve'); case 2 diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu index 12750ea4f..b2e6326ed 100644 --- a/perftest/cuda/cuperftest.cu +++ b/perftest/cuda/cuperftest.cu @@ -57,6 +57,7 @@ struct test_options_t { {"N1", required_argument, 0, 0}, {"N2", required_argument, 0, 0}, {"N3", required_argument, 0, 0}, + {"M", required_argument, 0, 0}, {"tol", required_argument, 0, 0}, {"method", required_argument, 0, 0}, {"kerevalmethod", required_argument, 0, 0}, @@ -87,7 +88,7 @@ struct test_options_t { N[2] = std::stof(get_or(options_map, "N3", "1")); M = std::stof(get_or(options_map, "M", "2E6")); method = std::stoi(get_or(options_map, "method", "1")); - kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "0")); + kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1")); sort = std::stoi(get_or(options_map, "sort", "1")); tol = std::stof(get_or(options_map, "tol", "1E-5")); } @@ -172,7 +173,7 @@ void run_test(test_options_t &test_opts) { for (int i = 0; i < 3; ++i) dim = test_opts.N[i] > 1 ? i + 1 : dim; - cufinufft_default_opts(test_opts.type, dim, &opts); + cufinufft_default_opts(&opts); opts.gpu_method = test_opts.method; opts.gpu_sort = test_opts.sort; opts.gpu_kerevalmeth = test_opts.kerevalmethod; diff --git a/python/cufinufft/cufinufft/__init__.py b/python/cufinufft/cufinufft/__init__.py index 730f6a693..31120bb96 100644 --- a/python/cufinufft/cufinufft/__init__.py +++ b/python/cufinufft/cufinufft/__init__.py @@ -3,5 +3,9 @@ from cufinufft._simple import (nufft1d1, nufft1d2, nufft2d1, nufft2d2, nufft3d1, nufft3d2) -__all__ = ['cufinufft'] -__version__ = '2.2.0dev0' +__all__ = ["nufft1d1", "nufft1d2", + "nufft2d1", "nufft2d2", + "nufft3d1", "nufft3d2", + "Plan"] + +__version__ = '2.2.0.dev0' diff --git a/python/cufinufft/cufinufft/_compat.py b/python/cufinufft/cufinufft/_compat.py new file mode 100644 index 000000000..04e066a1a --- /dev/null +++ b/python/cufinufft/cufinufft/_compat.py @@ -0,0 +1,106 @@ +import inspect + +import numpy as np + + +def get_array_ptr(data): + try: + return data.__cuda_array_interface__['data'][0] + except RuntimeError: + # Handle torch with gradient enabled + # https://github.com/flatironinstitute/finufft/pull/326#issuecomment-1652212770 + return data.data_ptr() + except AttributeError: + raise TypeError("Invalid GPU array implementation. 
Implementation must implement the standard cuda array interface.") + + +def get_array_module(obj): + module_name = inspect.getmodule(type(obj)).__name__ + + if module_name.startswith("numba.cuda"): + return "numba" + elif module_name.startswith("torch"): + return "torch" + elif module_name.startswith("pycuda"): + return "pycuda" + else: + return "generic" + + +def get_array_size(obj): + array_module = get_array_module(obj) + + if array_module == "torch": + return len(obj) + else: + return obj.size + + +def get_array_dtype(obj): + array_module = get_array_module(obj) + + if array_module == "torch": + dtype_str = str(obj.dtype) + dtype_str = dtype_str[len("torch."):] + return np.dtype(dtype_str) + else: + return obj.dtype + + +def is_array_contiguous(obj): + array_module = get_array_module(obj) + + if array_module == "numba": + return obj.is_c_contiguous() + elif array_module == "torch": + return obj.is_contiguous() + else: + return obj.flags.c_contiguous + + +def array_can_contiguous(obj): + array_module = get_array_module(obj) + + if array_module == "pycuda": + return False + else: + return True + + +def array_contiguous(obj): + array_module = get_array_module(obj) + + if array_module == "numba": + import numba + ret = numba.cuda.device_array(obj.shape, obj.dtype, stream=obj.stream) + ret[:] = obj[:] + return ret + if array_module == "torch": + return obj.contiguous() + else: + return obj.copy(order="C") + + +def array_empty_like(obj, *args, **kwargs): + module_name = get_array_module(obj) + + if module_name == "numba": + import numba.cuda + return numba.cuda.device_array(*args, **kwargs) + elif module_name == "torch": + import torch + if "shape" in kwargs: + kwargs["size"] = kwargs.pop("shape") + if "dtype" in kwargs: + dtype = kwargs.pop("dtype") + if dtype == np.complex64: + dtype = torch.complex64 + elif dtype == np.complex128: + dtype = torch.complex128 + kwargs["dtype"] = dtype + if "device" not in kwargs: + kwargs["device"] = obj.device + + return torch.empty(*args, **kwargs) + else: + return type(obj)(*args, **kwargs) diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py index 80e83643b..3e0128ce1 100644 --- a/python/cufinufft/cufinufft/_cufinufft.py +++ b/python/cufinufft/cufinufft/_cufinufft.py @@ -17,8 +17,6 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) import imp -import numpy as np - from ctypes import c_double from ctypes import c_int from ctypes import c_int64 @@ -93,8 +91,8 @@ class NufftOpts(ctypes.Structure): NufftOpts_p = ctypes.POINTER(NufftOpts) _default_opts = lib.cufinufft_default_opts -_default_opts.argtypes = [c_int, c_int, NufftOpts_p] -_default_opts.restype = c_int +_default_opts.argtypes = [NufftOpts_p] +_default_opts.restype = None _make_plan = lib.cufinufft_makeplan _make_plan.argtypes = [ diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py index 084619990..a165cbe20 100644 --- a/python/cufinufft/cufinufft/_plan.py +++ b/python/cufinufft/cufinufft/_plan.py @@ -25,7 +25,7 @@ from cufinufft._cufinufft import _destroy_plan from cufinufft._cufinufft import _destroy_planf -from pycuda.gpuarray import GPUArray +from cufinufft import _compat # If we are shutting down python, we don't need to run __del__ @@ -115,7 +115,7 @@ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None, self._maxbatch = 1 # TODO: optimize this one day # Get the default option values. 
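(A hypothetical, CPU-only sketch of the duck typing that the _compat helpers above rely on: any object exposing __cuda_array_interface__ works, regardless of framework. FakeGpuArray is invented here and simply reuses a host pointer to satisfy the contract; real GPU arrays expose a device pointer.)

import numpy as np

class FakeGpuArray:
    def __init__(self, arr):
        self._arr = np.ascontiguousarray(arr)

    @property
    def __cuda_array_interface__(self):
        # 'data' holds (pointer, read_only), as in the standard interface.
        return {"shape": self._arr.shape,
                "typestr": self._arr.dtype.str,
                "data": (self._arr.ctypes.data, False),
                "version": 2}

def get_array_ptr(data):
    # Mirrors the happy path of cufinufft._compat.get_array_ptr above.
    return data.__cuda_array_interface__["data"][0]

print(hex(get_array_ptr(FakeGpuArray(np.zeros(4, np.float32)))))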
- self._opts = self._default_opts(nufft_type, self.dim) + self._opts = self._default_opts() # Extract list of valid field names. field_names = [name for name, _ in self._opts._fields_] @@ -135,7 +135,7 @@ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None, self._references = [] @staticmethod - def _default_opts(nufft_type, dim): + def _default_opts(): """ Generates a cufinufft opt struct of the dtype corresponding to plan. @@ -147,10 +147,7 @@ def _default_opts(nufft_type, dim): nufft_opts = NufftOpts() - ier = _default_opts(nufft_type, dim, nufft_opts) - - if ier != 0: - raise RuntimeError('Configuration not yet implemented.') + _default_opts(nufft_opts) return nufft_opts @@ -206,7 +203,7 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None): _x, _y, _z = _ensure_valid_pts(_x, _y, _z, self.dim) - M = _x.size + M = _compat.get_array_size(_x) # Because FINUFFT/cufinufft are internally column major, # we will reorder the pts axes. Reordering references @@ -217,17 +214,17 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None): # (x, y, None) ~> (y, x, None) # (x, y, z) ~> (z, y, x) # Via code, we push each dimension onto a stack of axes - fpts_axes = [_x.ptr, None, None] + fpts_axes = [_compat.get_array_ptr(_x), None, None] # We will also store references to these arrays. # This keeps python from prematurely cleaning them up. self._references.append(_x) if self.dim >= 2: - fpts_axes.insert(0, _y.ptr) + fpts_axes.insert(0, _compat.get_array_ptr(_y)) self._references.append(_y) if self.dim >= 3: - fpts_axes.insert(0, _z.ptr) + fpts_axes.insert(0, _compat.get_array_ptr(_z)) self._references.append(_z) # Then take three items off the stack as our reordered axes. @@ -278,14 +275,16 @@ def execute(self, data, out=None): req_out_shape = batch_shape + req_out_shape if out is None: - _out = GPUArray(req_out_shape, dtype=self.dtype) + _out = _compat.array_empty_like(_data, req_out_shape, dtype=self.dtype) else: _out = _ensure_array_shape(_out, "out", req_out_shape) if self.type == 1: - ier = self._exec_plan(self._plan, data.ptr, _out.ptr) + ier = self._exec_plan(self._plan, _compat.get_array_ptr(_data), + _compat.get_array_ptr(_out)) elif self.type == 2: - ier = self._exec_plan(self._plan, _out.ptr, data.ptr) + ier = self._exec_plan(self._plan, _compat.get_array_ptr(_out), + _compat.get_array_ptr(_data)) if ier != 0: raise RuntimeError('Error executing plan.') @@ -315,27 +314,21 @@ def __del__(self): def _ensure_array_type(x, name, dtype, output=False): if x is None: - return GPUArray(0, dtype=dtype, order="C") + return None - if x.dtype != dtype: + if _compat.get_array_dtype(x) != dtype: raise TypeError(f"Argument `{name}` does not have the correct dtype: " f"{x.dtype} was given, but {dtype} was expected.") - if not x.flags.c_contiguous: - if output: + if not _compat.is_array_contiguous(x): + if output or not _compat.array_can_contiguous(x): raise TypeError(f"Argument `{name}` does not satisfy the " f"following requirement: C") else: - raise TypeError(f"Argument `{name}` does not satisfy the " - f"following requirement: C") - - # Ideally we'd copy the array into the correct ordering here, but - # this does not seem possible as of pycuda 2022.2.2. - - # warnings.warn(f"Argument `{name}` does not satisfy the " - # f"following requirement: C. Copying array (this may - # reduce performance)") - # x = gpuarray.GPUArray(x, dtype=dtype, order="C") + warnings.warn(f"Argument `{name}` does not satisfy the " + f"following requirement: C. 
Copying array " + f"(this may reduce performance)") + x = _compat.array_contiguous(x) return x @@ -354,22 +347,21 @@ def _ensure_array_shape(x, name, shape, allow_reshape=False): else: return x + def _ensure_valid_pts(x, y, z, dim): if x.ndim != 1: raise TypeError(f"Argument `x` must be a vector") - M = x.size - if dim >= 2: y = _ensure_array_shape(y, "y", x.shape) if dim >= 3: z = _ensure_array_shape(z, "z", x.shape) - if dim < 3 and z.size > 0: + if dim < 3 and z is not None and _compat.get_array_size(z) > 0: raise TypeError(f"Plan dimension is {dim}, but `z` was specified") - if dim < 2 and y.size > 0: + if dim < 2 and y is not None and _compat.get_array_size(y) > 0: raise TypeError(f"Plan dimension is {dim}, but `y` was specified") return x, y, z diff --git a/python/cufinufft/cufinufft/_simple.py b/python/cufinufft/cufinufft/_simple.py index 587868a0d..ac36e90ab 100644 --- a/python/cufinufft/cufinufft/_simple.py +++ b/python/cufinufft/cufinufft/_simple.py @@ -1,25 +1,30 @@ -from cufinufft import Plan +from cufinufft import Plan, _compat -def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1): - return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes) +def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs): + return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes, + kwargs) -def nufft1d2(x, data, out=None, eps=1e-6, isign=-1): - return _invoke_plan(1, 2, x, None, None, data, out, isign, eps) +def nufft1d2(x, data, out=None, eps=1e-6, isign=-1, **kwargs): + return _invoke_plan(1, 2, x, None, None, data, out, isign, eps, None, + kwargs) -def nufft2d1(x, y, data, n_modes=None, out=None, eps=1e-6, isign=1): - return _invoke_plan(2, 1, x, y, None, data, out, isign, eps, n_modes) +def nufft2d1(x, y, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs): + return _invoke_plan(2, 1, x, y, None, data, out, isign, eps, n_modes, + kwargs) -def nufft2d2(x, y, data, out=None, eps=1e-6, isign=-1): - return _invoke_plan(2, 2, x, y, None, data, out, isign, eps) +def nufft2d2(x, y, data, out=None, eps=1e-6, isign=-1, **kwargs): + return _invoke_plan(2, 2, x, y, None, data, out, isign, eps, None, kwargs) -def nufft3d1(x, y, z, data, n_modes=None, out=None, eps=1e-6, isign=1): - return _invoke_plan(3, 1, x, y, z, data, out, isign, eps, n_modes) +def nufft3d1(x, y, z, data, n_modes=None, out=None, eps=1e-6, isign=1, + **kwargs): + return _invoke_plan(3, 1, x, y, z, data, out, isign, eps, n_modes, kwargs) -def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1): - return _invoke_plan(3, 2, x, y, z, data, out, isign, eps) +def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1, **kwargs): + return _invoke_plan(3, 2, x, y, z, data, out, isign, eps, None, kwargs) -def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps, n_modes=None): - dtype = data.dtype +def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps, + n_modes=None, kwargs=None): + dtype = _compat.get_array_dtype(data) n_trans = _get_ntrans(dim, nufft_type, data) @@ -28,7 +33,7 @@ def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps, n_modes=None): if nufft_type == 2: n_modes = data.shape[-dim:] - plan = Plan(nufft_type, n_modes, n_trans, eps, isign, dtype) + plan = Plan(nufft_type, n_modes, n_trans, eps, isign, dtype, **kwargs) plan.setpts(x, y, z) diff --git a/python/cufinufft/examples/example2d1many.py b/python/cufinufft/examples/example2d1_pycuda.py similarity index 100% rename from python/cufinufft/examples/example2d1many.py rename to 
python/cufinufft/examples/example2d1_pycuda.py diff --git a/python/cufinufft/examples/example2d2many.py b/python/cufinufft/examples/example2d2_pycuda.py similarity index 100% rename from python/cufinufft/examples/example2d2many.py rename to python/cufinufft/examples/example2d2_pycuda.py diff --git a/python/cufinufft/examples/getting_started.py b/python/cufinufft/examples/getting_started_pycuda.py similarity index 100% rename from python/cufinufft/examples/getting_started.py rename to python/cufinufft/examples/getting_started_pycuda.py diff --git a/python/cufinufft/requirements.txt b/python/cufinufft/requirements.txt index fcbec6659..bc2cbbd1c 100644 --- a/python/cufinufft/requirements.txt +++ b/python/cufinufft/requirements.txt @@ -1,3 +1,2 @@ numpy -pycuda six diff --git a/python/cufinufft/setup.py b/python/cufinufft/setup.py index b1d63cd29..129e5d864 100644 --- a/python/cufinufft/setup.py +++ b/python/cufinufft/setup.py @@ -37,7 +37,7 @@ # Python Package Setup setup( name='cufinufft', - version='2.2.0dev0', + version='2.2.0.dev0', author='Yu-shuan Melody Shih, Garrett Wright, Joakim Anden, Johannes Blaschke, Alex Barnett', author_email='janden-vscholar@flatironinstitute.org', url='https://github.com/flatironinstitute/cufinufft', diff --git a/python/cufinufft/tests/conftest.py b/python/cufinufft/tests/conftest.py new file mode 100644 index 000000000..56528681f --- /dev/null +++ b/python/cufinufft/tests/conftest.py @@ -0,0 +1,24 @@ +import pytest + +import utils + + +def pytest_addoption(parser): + parser.addoption("--framework", action="append", default=[], help="List of frameworks") + +def pytest_generate_tests(metafunc): + if "framework" in metafunc.fixturenames: + metafunc.parametrize("framework", metafunc.config.getoption("framework")) + +@pytest.fixture +def to_gpu(framework): + to_gpu, _ = utils.transfer_funcs(framework) + + return to_gpu + + +@pytest.fixture +def to_cpu(framework): + _, to_cpu = utils.transfer_funcs(framework) + + return to_cpu diff --git a/python/cufinufft/tests/test_array_ordering.py b/python/cufinufft/tests/test_array_ordering.py index 61d704371..0fba8f8f5 100644 --- a/python/cufinufft/tests/test_array_ordering.py +++ b/python/cufinufft/tests/test_array_ordering.py @@ -2,39 +2,26 @@ import numpy as np -import pycuda.autoinit # NOQA:401 -import pycuda.gpuarray as gpuarray - -from cufinufft import Plan +from cufinufft import Plan, _compat import utils -def test_type2_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): - complex_dtype = utils._complex_dtype(dtype) - - k = utils.gen_nu_pts(M).astype(dtype) - fk = utils.gen_uniform_data(shape).astype(complex_dtype) - fkTT = fk.T.copy().T - - k_gpu = gpuarray.to_gpu(k) - fk_gpu = gpuarray.to_gpu(fk) - fkTT_gpu = gpuarray.to_gpu(fkTT) +def test_type1_ordering(to_gpu, to_cpu, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): + complex_dtype = utils._complex_dtype(dtype) - plan = Plan(2, shape, eps=tol, dtype=complex_dtype) + k, c = utils.type1_problem(dtype, shape, M) - plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2]) + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) - c_gpu = plan.execute(fk_gpu) + plan = Plan(1, shape, eps=tol, dtype=complex_dtype) - with pytest.raises(TypeError) as err: - cTT_gpu = plan.execute(fkTT_gpu) - assert "following requirement: C" in err.value.args[0] + plan.setpts(*k_gpu) - # Ideally, it should be possible to get this to align with true output, - # but corrently does not look like it. 
+ out = np.empty(shape, dtype=complex_dtype, order="F") - # c = c_gpu.get() - # cTT = cTT_gpu.get() + out_gpu = to_gpu(out) - # assert np.allclose(c, cTT, rtol=1e-2) + with pytest.raises(TypeError, match="following requirement: C") as err: + plan.execute(c_gpu, out=out_gpu) diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py index aa12b19ab..f4bc5713a 100644 --- a/python/cufinufft/tests/test_basic.py +++ b/python/cufinufft/tests/test_basic.py @@ -1,118 +1,112 @@ -import numpy as np +import pytest -import pycuda.autoinit # NOQA:401 -import pycuda.gpuarray as gpuarray +import numpy as np -from cufinufft import Plan +from cufinufft import Plan, _compat import utils +# NOTE: Tests below fail for tolerance 1e-4 (error executing plan). + +DTYPES = [np.float32, np.float64] +SHAPES = [(16,), (16, 16), (16, 16, 16)] +MS = [256, 1024, 4096] +TOLS = [1e-3, 1e-6] +OUTPUT_ARGS = [False, True] +CONTIGUOUS = [False, True] -def _test_type1(dtype, shape=(16, 16, 16), M=4096, tol=1e-3, output_arg=True): - complex_dtype = utils._complex_dtype(dtype) - dim = len(shape) +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("M", MS) +@pytest.mark.parametrize("tol", TOLS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg): + complex_dtype = utils._complex_dtype(dtype) - k = utils.gen_nu_pts(M, dim=dim).astype(dtype) - c = utils.gen_nonuniform_data(M).astype(complex_dtype) + k, c = utils.type1_problem(dtype, shape, M) - k_gpu = gpuarray.to_gpu(k) - c_gpu = gpuarray.to_gpu(c) + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) plan = Plan(1, shape, eps=tol, dtype=complex_dtype) - plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2]) + # Since k_gpu is an array of shape (dim, M), this will expand to + # plan.setpts(k_gpu[0], ..., k_gpu[dim-1]), allowing us to handle all + # dimensions with the same call.
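(To make that expansion concrete before the call that follows, a tiny stand-in in plain NumPy, no GPU required; fake_setpts is invented here purely to show the star-unpacking.)

import numpy as np

def fake_setpts(x, y=None, z=None):
    # Same calling shape as Plan.setpts; returns the axes actually supplied.
    return tuple(a for a in (x, y, z) if a is not None)

k = np.random.uniform(-np.pi, np.pi, (2, 7))  # dim = 2, M = 7
axes = fake_setpts(*k)                        # expands to fake_setpts(k[0], k[1])
assert len(axes) == 2 and axes[0].shape == (7,)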
+ plan.setpts(*k_gpu) if output_arg: - fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) + fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) plan.execute(c_gpu, out=fk_gpu) else: fk_gpu = plan.execute(c_gpu) - fk = fk_gpu.get() - - ind = int(0.1789 * np.prod(shape)) - - fk_est = fk.ravel()[ind] - fk_target = utils.direct_type1(c, k, shape, ind) + fk = to_cpu(fk_gpu) - type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) + utils.verify_type1(k, c, fk, tol) - print('Type 1 relative error:', type1_rel_err) - assert type1_rel_err < 0.01 - - -def test_type1_32(shape=(16, 16, 16), M=4096, tol=1e-3): - _test_type1(dtype=np.float32, shape=shape, M=M, tol=tol, - output_arg=True) - _test_type1(dtype=np.float32, shape=shape, M=M, tol=tol, - output_arg=False) - - -def test_type1_64(shape=(16, 16, 16), M=4096, tol=1e-3): - _test_type1(dtype=np.float64, shape=shape, M=M, tol=tol, - output_arg=True) - _test_type1(dtype=np.float64, shape=shape, M=M, tol=tol, - output_arg=False) - - -def _test_type2(dtype, shape=(16, 16, 16), M=4096, tol=1e-3, output_arg=True): +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("M", MS) +@pytest.mark.parametrize("tol", TOLS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +@pytest.mark.parametrize("contiguous", CONTIGUOUS) +def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous): complex_dtype = utils._complex_dtype(dtype) - k = utils.gen_nu_pts(M).astype(dtype) - fk = utils.gen_uniform_data(shape).astype(complex_dtype) - - k_gpu = gpuarray.to_gpu(k) - fk_gpu = gpuarray.to_gpu(fk) + k, fk = utils.type2_problem(dtype, shape, M) plan = Plan(2, shape, eps=tol, dtype=complex_dtype) - plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2]) + check_result = True - if output_arg: - c_gpu = gpuarray.GPUArray(shape=(M,), dtype=complex_dtype) - plan.execute(fk_gpu, out=c_gpu) - else: - c_gpu = plan.execute(fk_gpu) - - c = c_gpu.get() - - ind = M // 2 + if not contiguous and len(shape) > 1: + fk = fk.copy(order="F") - c_est = c[ind] - c_target = utils.direct_type2(fk, k[:, ind]) + if _compat.array_can_contiguous(to_gpu(np.empty(1))): + def _execute(*args, **kwargs): + with pytest.warns(UserWarning, match="requirement: C. 
Copying"): + return plan.execute(*args, **kwargs) + else: + check_result = False - type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target) + def _execute(*args, **kwargs): + with pytest.raises(TypeError, match="requirement: C"): + plan.execute(*args, **kwargs) - print('Type 2 relative error:', type2_rel_err) + else: + def _execute(*args, **kwargs): + return plan.execute(*args, **kwargs) - assert type2_rel_err < 0.01 + k_gpu = to_gpu(k) + fk_gpu = to_gpu(fk) + plan.setpts(*k_gpu) -def test_type2_32(shape=(16, 16, 16), M=4096, tol=1e-3): - _test_type2(dtype=np.float32, shape=shape, M=M, tol=tol, output_arg=True) - _test_type2(dtype=np.float32, shape=shape, M=M, tol=tol, output_arg=False) + if output_arg: + c_gpu = _compat.array_empty_like(fk_gpu, (M,), dtype=complex_dtype) + _execute(fk_gpu, out=c_gpu) + else: + c_gpu = _execute(fk_gpu) + if check_result: + c = to_cpu(c_gpu) -def test_type2_64(shape=(16, 16, 16), M=4096, tol=1e-3): - _test_type2(dtype=np.float64, shape=shape, M=M, tol=tol, output_arg=True) - _test_type2(dtype=np.float64, shape=shape, M=M, tol=tol, output_arg=False) + utils.verify_type2(k, fk, c, tol) -def test_opts(shape=(8, 8, 8), M=32, tol=1e-3): +def test_opts(to_gpu, to_cpu, shape=(8, 8, 8), M=32, tol=1e-3): dtype = np.float32 complex_dtype = utils._complex_dtype(dtype) - dim = len(shape) - - k = utils.gen_nu_pts(M, dim=dim).astype(dtype) - c = utils.gen_nonuniform_data(M).astype(complex_dtype) + k, c = utils.type1_problem(dtype, shape, M) - k_gpu = gpuarray.to_gpu(k) - c_gpu = gpuarray.to_gpu(c) - fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) + fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype) plan = Plan(1, shape, eps=tol, dtype=complex_dtype, gpu_sort=False, gpu_maxsubprobsize=10) @@ -121,24 +115,6 @@ def test_opts(shape=(8, 8, 8), M=32, tol=1e-3): plan.execute(c_gpu, fk_gpu) - fk = fk_gpu.get() - - ind = int(0.1789 * np.prod(shape)) - - fk_est = fk.ravel()[ind] - fk_target = utils.direct_type1(c, k, shape, ind) - - type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) - - assert type1_rel_err < 0.01 - - -def main(): - test_type1_32() - test_type2_32() - test_type1_64() - test_type2_64() - + fk = to_cpu(fk_gpu) -if __name__ == '__main__': - main() + utils.verify_type1(k, c, fk, tol) diff --git a/python/cufinufft/tests/test_error_checks.py b/python/cufinufft/tests/test_error_checks.py index ca4b42415..6a9a6b4aa 100644 --- a/python/cufinufft/tests/test_error_checks.py +++ b/python/cufinufft/tests/test_error_checks.py @@ -1,15 +1,11 @@ import numpy as np import pytest -import pycuda.autoinit # NOQA:401 -import pycuda.gpuarray as gpuarray - -from cufinufft import Plan +from cufinufft import Plan, _compat import utils - -def test_set_nu_raises_on_dtype(): +def test_set_nu_raises_on_dtype(to_gpu): dtype = np.complex64 M = 4096 @@ -19,10 +15,10 @@ def test_set_nu_raises_on_dtype(): kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) - kxyz_gpu = gpuarray.to_gpu(kxyz) + kxyz_gpu = to_gpu(kxyz) # Here we'll intentionally contruct an incorrect array dtype. 
- kxyz_gpu_wrong_type = gpuarray.to_gpu(kxyz.astype(np.float64)) + kxyz_gpu_wrong_type = to_gpu(kxyz.real.astype(np.float64)) plan = Plan(1, shape, eps=tol, dtype=dtype) @@ -40,7 +36,7 @@ def test_set_nu_raises_on_dtype(): kxyz_gpu_wrong_type[1], kxyz_gpu_wrong_type[2]) -def test_set_pts_raises_on_size(): +def test_set_pts_raises_on_size(to_gpu): dtype = np.float32 complex_dtype = np.complex64 @@ -51,26 +47,68 @@ def test_set_pts_raises_on_size(): kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) - kxyz_gpu = gpuarray.to_gpu(kxyz) + kxyz_gpu = to_gpu(kxyz) plan = Plan(1, shape, eps=tol, dtype=complex_dtype) - with pytest.raises(TypeError) as err: + with pytest.raises(TypeError, match="`y` must be of shape") as err: plan.setpts(kxyz_gpu[0], kxyz_gpu[1][:4]) - assert '`y` must be of shape' in err.value.args[0] - with pytest.raises(TypeError) as err: + with pytest.raises(TypeError, match="`z` must be of shape") as err: plan.setpts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2][:4]) - assert '`z` must be of shape' in err.value.args[0] + + +def test_set_pts_raises_on_nonvector(to_gpu): + dtype = np.float32 + complex_dtype = np.complex64 + + M = 8 + tol = 1e-3 + shape = (16, 16, 16) + dim = len(shape) + + kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) + + kxyz_gpu = to_gpu(kxyz) + + plan = Plan(1, shape, eps=tol, dtype=complex_dtype) + + with pytest.raises(TypeError, match="`x` must be a vector") as err: + plan.setpts(kxyz) + + +def test_set_pts_raises_on_number_of_args(to_gpu): + dtype = np.float32 + complex_dtype = np.complex64 + + M = 8 + tol = 1e-3 + shape = (16,) + dim = len(shape) + + kxyz = utils.gen_nu_pts(M, dim=3).astype(dtype) + + kxyz_gpu = to_gpu(kxyz) + + plan = Plan(1, shape, eps=tol, dtype=complex_dtype) + + with pytest.raises(TypeError, match="is 1, but `y` was specified") as err: + plan.setpts(*kxyz_gpu[:2]) + + shape = (16, 16) + + plan = Plan(1, shape, eps=tol, dtype=complex_dtype) + + with pytest.raises(TypeError, match="is 2, but `z` was specified") as err: + plan.setpts(*kxyz_gpu) def test_wrong_field_names(): - with pytest.raises(TypeError) as err: + with pytest.raises(TypeError, match="Invalid option 'foo'") as err: plan = Plan(1, (8, 8), foo="bar") - assert "Invalid option 'foo'" in err.value.args[0] -def test_exec_raises_on_dtype(): +def test_exec_raises_on_dtype(to_gpu): dtype = np.float32 complex_dtype = np.complex64 @@ -81,14 +119,17 @@ def test_exec_raises_on_dtype(): kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype) c = utils.gen_nonuniform_data(M).astype(complex_dtype) - c_gpu = gpuarray.to_gpu(c) + c_gpu = to_gpu(c) # Using c.real gives us wrong dtype here... + # Need contiguous here since numba does not allow transfers of + # non-contiguous arrays. + c_gpu_wrong_dtype = to_gpu(np.ascontiguousarray(c.real)) - kxyz_gpu = gpuarray.to_gpu(kxyz) - fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) + kxyz_gpu = to_gpu(kxyz) + fk_gpu = _compat.array_empty_like(kxyz_gpu, shape, dtype=complex_dtype) # Here we'll intentionally construct an incorrect array dtype. 
- fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128) + fk_gpu_wrong_dtype = _compat.array_empty_like(fk_gpu, shape, + dtype=np.complex128) plan = Plan(1, shape, eps=tol, dtype=complex_dtype) @@ -102,11 +143,14 @@ def test_exec_raises_on_dtype(): plan.execute(c_gpu_wrong_dtype, fk_gpu) +def test_dtype_errors(): + with pytest.raises(TypeError, match="Expected complex64 or complex128") as err: + Plan(1, (8, 8), dtype="uint8") + + def test_dtype_warnings(): - with pytest.warns(DeprecationWarning) as record: + with pytest.warns(DeprecationWarning, match="Converting to complex64") as record: Plan(1, (8, 8), dtype="float32") - assert "Converting to complex64" in record[0].message.args[0] - with pytest.warns(DeprecationWarning) as record: + with pytest.warns(DeprecationWarning, match="Converting to complex128") as record: Plan(1, (8, 8), dtype="float64") - assert "Converting to complex128" in record[0].message.args[0] diff --git a/python/cufinufft/tests/test_examples.py b/python/cufinufft/tests/test_examples.py index 34fe610a8..c6fb5dd45 100644 --- a/python/cufinufft/tests/test_examples.py +++ b/python/cufinufft/tests/test_examples.py @@ -17,5 +17,11 @@ scripts.append(os.path.join(examples_dir, filename)) @pytest.mark.parametrize("filename", scripts) -def test_example(filename): - subprocess.check_call([sys.executable, filename]) +def test_example(filename, request): + # Extract framework from format `example_framework.py`. + framework = Path(filename).stem.split("_")[-1] + + if framework in request.config.getoption("framework"): + subprocess.check_call([sys.executable, filename]) + else: + pytest.skip("Example not in list of frameworks") diff --git a/python/cufinufft/tests/test_multi.py b/python/cufinufft/tests/test_multi.py index c05810c4d..a8e392fed 100644 --- a/python/cufinufft/tests/test_multi.py +++ b/python/cufinufft/tests/test_multi.py @@ -1,16 +1,18 @@ import pytest import numpy as np - -import pycuda.driver as drv -import pycuda.gpuarray as gpuarray - from cufinufft import Plan import utils -def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): +def test_multi_type1(framework, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): + if framework == "pycuda": + import pycuda.driver as drv + import pycuda.gpuarray as gpuarray + else: + pytest.skip("Multi-GPU support only tested for pycuda") + complex_dtype = utils._complex_dtype(dtype) drv.init() @@ -59,11 +61,3 @@ def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3): errs.append(type1_rel_err) assert all(err < 0.01 for err in errs) - - -def main(): - test_multi_type1() - - -if __name__ == '__main__': - main() diff --git a/python/cufinufft/tests/test_simple.py b/python/cufinufft/tests/test_simple.py index 906d85c27..70b602143 100644 --- a/python/cufinufft/tests/test_simple.py +++ b/python/cufinufft/tests/test_simple.py @@ -2,86 +2,85 @@ import numpy as np -import pycuda.autoinit -import pycuda.gpuarray as gpuarray - import cufinufft +from cufinufft import _compat import utils DTYPES = [np.float32, np.float64] -SHAPES = [(64,), (64, 64), (64, 64, 64)] +SHAPES = [(16,), (16, 16), (16, 16, 16)] +N_TRANS = [(), (1,), (2,)] MS = [256, 1024, 4096] -TOLS = [1e-2, 1e-3] +TOLS = [1e-3, 1e-6] OUTPUT_ARGS = [False, True] @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("n_trans", N_TRANS) +@pytest.mark.parametrize("shape", SHAPES) @pytest.mark.parametrize("M", MS) @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def 
test_nufft3d1(dtype, M, tol, output_arg): - shape = (64, 64, 64) - +def test_simple_type1(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg): real_dtype = dtype complex_dtype = utils._complex_dtype(dtype) dim = len(shape) - k = utils.gen_nu_pts(M, dim=dim).astype(real_dtype) - c = utils.gen_nonuniform_data(M).astype(complex_dtype) + # Select which function to call based on dimension. + fun = {1: cufinufft.nufft1d1, + 2: cufinufft.nufft2d1, + 3: cufinufft.nufft3d1}[dim] - k_gpu = gpuarray.to_gpu(k) - c_gpu = gpuarray.to_gpu(c) + k, c = utils.type1_problem(dtype, shape, M, n_trans=n_trans) - if output_arg: - fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype) - cufinufft.nufft3d1(*k_gpu, c_gpu, out=fk_gpu, eps=tol) - else: - fk_gpu = cufinufft.nufft3d1(*k_gpu, c_gpu, shape, eps=tol) + k_gpu = to_gpu(k) + c_gpu = to_gpu(c) - fk = fk_gpu.get() - - ind = int(0.1789 * np.prod(shape)) + if output_arg: + # Ensure that output array has proper shape i.e., (N1, ...) for no + # batch, (1, N1, ...) for batch of size one, and (n, N1, ...) for + # batch of size n. + fk_gpu = _compat.array_empty_like(c_gpu, n_trans + shape, + dtype=complex_dtype) - fk_est = fk.ravel()[ind] - fk_target = utils.direct_type1(c, k, shape, ind) + fun(*k_gpu, c_gpu, out=fk_gpu, eps=tol) + else: + fk_gpu = fun(*k_gpu, c_gpu, shape, eps=tol) - type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target) + fk = to_cpu(fk_gpu) - assert type1_rel_err < 10 * tol + utils.verify_type1(k, c, fk, tol) @pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("n_trans", N_TRANS) @pytest.mark.parametrize("M", MS) @pytest.mark.parametrize("tol", TOLS) @pytest.mark.parametrize("output_arg", OUTPUT_ARGS) -def test_nufft3d2(dtype, M, tol, output_arg): - shape = (64, 64, 64) - +def test_simple_type2(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg): real_dtype = dtype complex_dtype = utils._complex_dtype(dtype) dim = len(shape) - k = utils.gen_nu_pts(M, dim=dim).astype(real_dtype) - fk = utils.gen_uniform_data(shape).astype(complex_dtype) - - k_gpu = gpuarray.to_gpu(k) - fk_gpu = gpuarray.to_gpu(fk) + fun = {1: cufinufft.nufft1d2, + 2: cufinufft.nufft2d2, + 3: cufinufft.nufft3d2}[dim] - if output_arg: - c_gpu = gpuarray.GPUArray((M,), dtype=complex_dtype) - cufinufft.nufft3d2(*k_gpu, fk_gpu, out=c_gpu) - else: - c_gpu = cufinufft.nufft3d2(*k_gpu, fk_gpu) + k, fk = utils.type2_problem(dtype, shape, M, n_trans=n_trans) - c = c_gpu.get() + k_gpu = to_gpu(k) + fk_gpu = to_gpu(fk) - ind = M // 2 + if output_arg: + c_gpu = _compat.array_empty_like(fk_gpu, n_trans + (M,), + dtype=complex_dtype) - c_est = c[ind] - c_target = utils.direct_type2(fk, k[:, ind]) + fun(*k_gpu, fk_gpu, eps=tol, out=c_gpu) + else: + c_gpu = fun(*k_gpu, fk_gpu, eps=tol) - type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target) + c = to_cpu(c_gpu) - assert type2_rel_err < 10 * tol + utils.verify_type2(k, fk, c, tol) diff --git a/python/cufinufft/tests/utils.py b/python/cufinufft/tests/utils.py index 9f7e70a63..9ea3281f3 100644 --- a/python/cufinufft/tests/utils.py +++ b/python/cufinufft/tests/utils.py @@ -37,35 +37,121 @@ def gen_uniform_data(shape, seed=0): return fk -def gen_nonuniform_data(M, seed=0): +def gen_nonuniform_data(M, seed=0, n_trans=()): np.random.seed(seed) - c = np.random.standard_normal(2 * M) + c = np.random.standard_normal(2 * M * int(np.prod(n_trans))) c = c.astype(np.float64).view(np.complex128) + c = c.reshape(n_trans + (M,)) return c +def type1_problem(dtype, shape, M, 
n_trans=()): + complex_dtype = _complex_dtype(dtype) + dim = len(shape) + + k = gen_nu_pts(M, dim=dim).astype(dtype) + c = gen_nonuniform_data(M, n_trans=n_trans).astype(complex_dtype) + + return k, c + + +def type2_problem(dtype, shape, M, n_trans=()): + complex_dtype = _complex_dtype(dtype) + dim = len(shape) + + k = gen_nu_pts(M, dim=dim).astype(dtype) + fk = gen_uniform_data(n_trans + shape).astype(complex_dtype) + + return k, fk + + def make_grid(shape): dim = len(shape) - shape = (1,) * (3 - dim) + shape + shape = shape grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape] - x, y, z = np.meshgrid(*grids, indexing='ij') - return np.stack((x, y, z)) + grids = np.meshgrid(*grids, indexing='ij') + return np.stack(grids) def direct_type1(c, k, shape, ind): + dim = len(shape) + grid = make_grid(shape) - phase = k.T @ grid.reshape((3, -1))[:, ind] - fk = np.sum(c * np.exp(1j * phase)) + phase = k.T @ grid.reshape((dim, -1))[:, ind] + fk = np.sum(c * np.exp(1j * phase), -1) return fk -def direct_type2(fk, k): - grid = make_grid(fk.shape) +def direct_type2(fk, k, dim): + grid = make_grid(fk.shape[-dim:]) - phase = k @ grid.reshape((3, -1)) - c = np.sum(fk.ravel() * np.exp(-1j * phase)) + phase = k @ grid.reshape((dim, -1)) + + # Ravel the spatial dimensions only. + fk = fk.reshape(fk.shape[:-dim] + (np.prod(fk.shape[-dim:]),)) + c = np.sum(fk * np.exp(-1j * phase), -1) return c + + +def verify_type1(k, c, fk, tol): + dim = fk.ndim - (c.ndim - 1) + + shape = fk.shape[-dim:] + + ind = int(0.1789 * np.prod(shape)) + + # Ravel the spatial dimensions only. + fk_est = fk.reshape(fk.shape[:-dim] + (np.prod(fk.shape[-dim:]),))[..., ind] + fk_target = direct_type1(c, k, shape, ind) + + type1_rel_err = np.linalg.norm(fk_target - fk_est) / np.linalg.norm(fk_target) + + assert type1_rel_err < 25 * tol + + +def verify_type2(k, fk, c, tol): + dim = fk.ndim - (c.ndim - 1) + + M = c.shape[-1] + + ind = M // 2 + + c_est = c[..., ind] + c_target = direct_type2(fk, k[:, ind], dim) + + type2_rel_err = np.linalg.norm(c_target - c_est) / np.linalg.norm(c_target) + + assert type2_rel_err < 25 * tol + + +def transfer_funcs(module_name): + if module_name == "pycuda": + import pycuda.autoinit # NOQA:401 + from pycuda.gpuarray import to_gpu + def to_cpu(obj): + return obj.get() + elif module_name == "cupy": + import cupy + def to_gpu(obj): + return cupy.array(obj) + def to_cpu(obj): + return obj.get() + elif module_name == "numba": + import numba.cuda + to_gpu = numba.cuda.to_device + def to_cpu(obj): + return obj.copy_to_host() + elif module_name == "torch": + import torch + def to_gpu(obj): + return torch.as_tensor(obj, device=torch.device("cuda")) + def to_cpu(obj): + return obj.cpu().numpy() + else: + raise TypeError(f"Unsupported framework: {module_name}") + + return to_gpu, to_cpu diff --git a/python/finufft/finufft/__init__.py b/python/finufft/finufft/__init__.py index b98791c76..adb129d7e 100644 --- a/python/finufft/finufft/__init__.py +++ b/python/finufft/finufft/__init__.py @@ -17,4 +17,4 @@ from finufft._interfaces import nufft2d1,nufft2d2,nufft2d3 from finufft._interfaces import nufft3d1,nufft3d2,nufft3d3 -__version__ = '2.1.0' +__version__ = '2.2.0.dev0' diff --git a/python/finufft/finufft/_interfaces.py b/python/finufft/finufft/_interfaces.py index d4d26dd4b..e7197ecba 100644 --- a/python/finufft/finufft/_interfaces.py +++ b/python/finufft/finufft/_interfaces.py @@ -100,26 +100,41 @@ def __init__(self,nufft_type,n_modes_or_dim,n_trans=1,eps=1e-6,isign=None,dtype= _finufft._default_opts(opts) 
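(Aside on the tests' transfer_funcs helper above: each framework branch returns a matched to_gpu/to_cpu pair. A minimal round-trip sketch through the torch branch, assuming a CUDA-capable torch install; the function bodies are copied from that branch.)

import numpy as np
import torch

def to_gpu(obj):
    # Host-to-device transfer, as in transfer_funcs("torch").
    return torch.as_tensor(obj, device=torch.device("cuda"))

def to_cpu(obj):
    # Device-to-host transfer back to a NumPy array.
    return obj.cpu().numpy()

x = np.linspace(0.0, 1.0, 8, dtype=np.float32)
assert np.allclose(to_cpu(to_gpu(x)), x)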
setkwopts(opts,**kwargs) + dtype = np.dtype(dtype) + + if dtype == np.float64: + warnings.warn("Real dtypes are currently deprecated and will be " + "removed in version 2.3. Converting to complex128.", + DeprecationWarning) + dtype = np.complex128 + + if dtype == np.float32: + warnings.warn("Real dtypes are currently deprecated and will be " + "removed in version 2.3. Converting to complex64.", + DeprecationWarning) + dtype = np.complex64 + is_single = is_single_dtype(dtype) # construct plan based on precision type and eps default value plan = c_void_p(None) # setting n_modes and dim for makeplan - n_modes = np.ones([3], dtype=np.int64) if nufft_type==3: npdim = np.asarray(n_modes_or_dim, dtype=np.int64) if npdim.size != 1: raise RuntimeError('FINUFFT type 3 plan n_modes_or_dim must be one number, the dimension') dim = int(npdim) + n_modes = np.ones([dim], dtype=np.int64) else: npmodes = np.asarray(n_modes_or_dim, dtype=np.int64) if npmodes.size>3 or npmodes.size<1: raise RuntimeError("FINUFFT n_modes dimension must be 1, 2, or 3") dim = int(npmodes.size) + n_modes = np.ones([dim], dtype=np.int64) n_modes[0:dim] = npmodes[::-1] - n_modes = (c_longlong * 3)(*n_modes) + n_modes = (c_longlong * dim)(*n_modes) if is_single: self._makeplan = _finufft._makeplanf @@ -204,8 +219,6 @@ def setpts(self,x=None,y=None,z=None,s=None,t=None,u=None): ier = self._setpts(self.inner_plan, self.nj, self._yj, self._xj, self._zj, self.nk, self._t, self._s, self._u) elif self.dim == 3: ier = self._setpts(self.inner_plan, self.nj, self._zj, self._yj, self._xj, self.nk, self._u, self._t, self._s) - else: - raise RuntimeError("FINUFFT dimension must be 1, 2, or 3") if ier != 0: err_handler(ier) @@ -242,9 +255,7 @@ def execute(self,data,out=None): dim = self.dim if tp==1 or tp==2: - ms = self.n_modes[0] - mt = self.n_modes[1] - mu = self.n_modes[2] + ms, mt, mu = [*self.n_modes, *([1]*(3-len(self.n_modes)))] # input shape and size check if tp==2: @@ -264,11 +275,11 @@ def execute(self,data,out=None): # allocate out if None if out is None: if tp==1: - _out = np.squeeze(np.zeros([n_trans, mu, mt, ms], dtype=self.dtype, order='C')) + _out = np.zeros([*data.shape[:-1], *self.n_modes[::-1]], dtype=self.dtype, order='C') if tp==2: - _out = np.squeeze(np.zeros([n_trans, nj], dtype=self.dtype, order='C')) + _out = np.zeros([*data.shape[:-dim], nj], dtype=self.dtype, order='C') if tp==3: - _out = np.squeeze(np.zeros([n_trans, nk], dtype=self.dtype, order='C')) + _out = np.zeros([*data.shape[:-1], nk], dtype=self.dtype, order='C') # call execute based on type and precision type if tp==1 or tp==3: @@ -279,19 +290,12 @@ def execute(self,data,out=None): ier = self._execute(self.inner_plan, _out.ctypes.data_as(c_void_p), _data.ctypes.data_as(c_void_p)) - else: - ier = 10 # check error if ier != 0: err_handler(ier) - # return out - if out is None: - return _out - else: - _copy(_out,out) - return out + return _out def __del__(self): @@ -327,15 +331,6 @@ def _ensure_array_type(x, name, dtype, output=False): return x -### David Stein's functions for checking input and output variables -def _copy(_x, x): - """ - Copy _x to x, only if the underlying data of _x differs from that of x - """ - if _x is not x: - x[:] = _x - - ### error handler (keep up to date with FINUFFT/include/defs.h) def err_handler(ier): switcher = { @@ -480,9 +475,9 @@ def valid_fshape(fshape,n_trans,dim,ms,mt,mu,nk,tp): def is_single_dtype(dtype): dtype = np.dtype(dtype) - if dtype == np.dtype('float64') or dtype == np.dtype('complex128'): + if dtype == 
np.dtype('complex128'): return False - elif dtype == np.dtype('float32') or dtype == np.dtype('complex64'): + elif dtype == np.dtype('complex64'): return True else: raise RuntimeError('FINUFFT dtype(precision type) must be single or double') @@ -505,22 +500,20 @@ def setkwopts(opt,**kwargs): ### destroy def destroy(plan): - if plan is None: - return + if hasattr(plan, "inner_plan"): + ier = plan._destroy(plan.inner_plan) - ier = plan._destroy(plan.inner_plan) - - if ier != 0: - err_handler(ier) + if ier != 0: + err_handler(ier) ### invoke guru interface, this function is used for simple interfaces def invoke_guru(dim,tp,x,y,z,c,s,t,u,f,isign,eps,n_modes,**kwargs): # infer dtype from x if x.dtype is np.dtype('float64'): - pdtype = 'double' + pdtype = 'complex128' elif x.dtype is np.dtype('float32'): - pdtype = 'single' + pdtype = 'complex64' else: raise RuntimeError('FINUFFT x dtype should be float64 for double precision or float32 for single precision') # check n_modes type, n_modes must be a tuple or an integer diff --git a/python/finufft/setup.py b/python/finufft/setup.py index 9c89b19ed..acd0daae0 100644 --- a/python/finufft/setup.py +++ b/python/finufft/setup.py @@ -5,11 +5,12 @@ # attempt ../make.inc reading (failed) and default finufftdir. 2/25/20 # Barnett trying to get sphinx.ext.autodoc to work w/ this, 10/5/20 -__version__ = '2.1.0' +__version__ = '2.2.0.dev0' from setuptools import setup, Extension import os import platform +from pathlib import Path from tempfile import mkstemp @@ -18,8 +19,7 @@ # Note: This will not work if run through pip install since setup.py is copied # to a different location. if finufft_dir == None or finufft_dir == '': - current_path = os.path.abspath(__file__) - finufft_dir = os.path.dirname(os.path.dirname(current_path)) + finufft_dir = Path(__file__).resolve().parents[2] # Set include and library paths relative to FINUFFT root directory. inc_dir = os.path.join(finufft_dir, 'include') diff --git a/python/finufft/test/accuracy_speed_tests.py b/python/finufft/test/accuracy_speed_tests.py index 2b690d491..78de3d7dd 100644 --- a/python/finufft/test/accuracy_speed_tests.py +++ b/python/finufft/test/accuracy_speed_tests.py @@ -21,7 +21,8 @@ def print_report(label,elapsed,Xest,Xtrue,npts): print(label+':') print(' Est rel l2 err %.3g' % (compute_error(Xest,Xtrue))) print(' CPU time (sec) %.3g' % (elapsed)) - print(' tot NU pts/sec %.3g' % (npts/elapsed)) + if (elapsed>0): + print(' tot NU pts/sec %.3g' % (npts/elapsed)) print('') def accuracy_speed_tests(num_nonuniform_points,num_uniform_points,eps): diff --git a/python/finufft/test/test_finufft_plan.py b/python/finufft/test/test_finufft_plan.py new file mode 100644 index 000000000..2bc0d7d64 --- /dev/null +++ b/python/finufft/test/test_finufft_plan.py @@ -0,0 +1,245 @@ +import pytest + +import numpy as np + +from finufft import Plan + +import utils + + +# NOTE: Only doing single transformations for now. 
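(For orientation, a minimal sketch of the plan call pattern the new tests below exercise, using the public finufft API as shown in this patch; the sizes and seed are arbitrary.)

import numpy as np
import finufft

rng = np.random.default_rng(0)
shape, M = (8, 8), 10
pts = rng.uniform(-np.pi, np.pi, (2, M))
coefs = (rng.standard_normal(M) + 1j * rng.standard_normal(M)).astype(np.complex128)

plan = finufft.Plan(1, shape, dtype=np.complex128)  # 2D type-1 plan
plan.setpts(*pts)                                   # pts has shape (dim, M)
sig = plan.execute(coefs)                           # returns an (8, 8) array
print(sig.shape)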
+SHAPES = [(7,), (8,), (7, 7), (7, 8), (8, 8), (7, 7, 7), (7, 8, 8), (8, 8, 8)] +N_PTS = [10, 11] +DTYPES = [np.complex64, np.complex128] +OUTPUT_ARGS = [False, True] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("n_pts", N_PTS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft1_plan(dtype, shape, n_pts, output_arg): + pts, coefs = utils.type1_problem(dtype, shape, n_pts) + + plan = Plan(1, shape, dtype=dtype) + + plan.setpts(*pts) + + if not output_arg: + sig = plan.execute(coefs) + else: + sig = np.empty(shape, dtype=dtype) + plan.execute(coefs, out=sig) + + utils.verify_type1(pts, coefs, shape, sig, 1e-6) + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("n_pts", N_PTS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft2_plan(dtype, shape, n_pts, output_arg): + pts, sig = utils.type2_problem(dtype, shape, n_pts) + + plan = Plan(2, shape, dtype=dtype) + + plan.setpts(*pts) + + if not output_arg: + coefs = plan.execute(sig) + else: + coefs = np.empty(n_pts, dtype=dtype) + plan.execute(sig, out=coefs) + + utils.verify_type2(pts, sig, coefs, 1e-6) + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("dim", list(set(len(shape) for shape in SHAPES))) +@pytest.mark.parametrize("n_source_pts", N_PTS) +@pytest.mark.parametrize("n_target_pts", N_PTS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft3_plan(dtype, dim, n_source_pts, n_target_pts, output_arg): + source_pts, source_coefs, target_pts = utils.type3_problem(dtype, + dim, n_source_pts, n_target_pts) + + plan = Plan(3, dim, dtype=dtype) + + plan.setpts(*source_pts, *((None,) * (3 - dim)), *target_pts) + + if not output_arg: + target_coefs = plan.execute(source_coefs) + else: + target_coefs = np.empty(n_target_pts, dtype=dtype) + plan.execute(source_coefs, out=target_coefs) + + utils.verify_type3(source_pts, source_coefs, target_pts, target_coefs, 1e-6) + + +def test_finufft_plan_modeord(): + dtype = "complex64" + shape = (8, 8) + n_pts = 17 + + plan = Plan(1, shape, dtype=dtype, modeord=1) + + pts, coefs = utils.type1_problem(dtype, shape, n_pts) + + plan.setpts(*pts) + + sig = plan.execute(coefs) + + sig = np.fft.fftshift(sig) + + utils.verify_type1(pts, coefs, shape, sig, 1e-6) + + +def test_finufft_plan_errors(): + with pytest.raises(RuntimeError, match="must be single or double"): + Plan(1, (8, 8), dtype="uint32") + + with pytest.warns(Warning, match="finufft_opts does not have"): + Plan(1, (8, 8), foo="bar") + + with pytest.raises(RuntimeError, match="type 3 plan n_modes_or_dim must be"): + Plan(3, (1, 2)) + + with pytest.raises(RuntimeError, match="n_modes dimension must be 1, 2"): + Plan(2, (1, 2, 3, 4)) + + with pytest.warns(Warning, match="eps tolerance too small"): + Plan(1, (8, 8), eps=1e-30) + + with pytest.raises(TypeError, match="does not have the correct dtype"): + Plan(1, (8, 8), dtype="complex64").setpts(np.ones(1, dtype="complex128")) + + with pytest.raises(TypeError, match="the following requirement: C"): + plan = Plan(1, (8, 8), dtype="complex64") + plan.setpts(*np.ones((2, 1), dtype="float32")) + plan.execute(np.ones(1, dtype="complex64"), out=np.ones((8, 8), + dtype="complex64", order="F")) + + with pytest.raises(TypeError, match="the following requirement: W"): + plan = Plan(1, (8, 8), dtype="complex64") + plan.setpts(*np.ones((2, 1), dtype="float32")) + out = np.ones((8, 8), dtype="complex64") + 
out.setflags(write=False) + plan.execute(np.ones(1, dtype="complex64"), out=out) + + with pytest.warns(Warning, match="the following requirement: C. Copying"): + plan = Plan(2, (8, 8), dtype="complex64") + plan.setpts(*np.ones((2, 1), dtype="float32")) + plan.execute(np.ones((8, 8), dtype="complex64", order="F")) + + vec = np.ones(1, dtype="float32") + not_vec = np.ones((2, 1), dtype="float32") + + with pytest.raises(RuntimeError, match="x must be a vector"): + Plan(1, (8, 8, 8), dtype="complex64").setpts(not_vec, vec, vec) + + with pytest.raises(RuntimeError, match="y must be a vector"): + Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, not_vec, vec) + + with pytest.raises(RuntimeError, match="z must be a vector"): + Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, vec, not_vec) + + with pytest.raises(RuntimeError, match="s must be a vector"): + Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, not_vec, vec, vec) + + with pytest.raises(RuntimeError, match="t must be a vector"): + Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, not_vec, vec) + + with pytest.raises(RuntimeError, match="u must be a vector"): + Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, vec, not_vec) + + vec = np.ones(3, dtype="float32") + long_vec = np.ones(4, dtype="float32") + + with pytest.raises(RuntimeError, match="y must have same length as x"): + Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, long_vec, vec) + + with pytest.raises(RuntimeError, match="z must have same length as x"): + Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, vec, long_vec) + + with pytest.raises(RuntimeError, match="t must have same length as s"): + Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, long_vec, vec) + + with pytest.raises(RuntimeError, match="u must have same length as s"): + Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, vec, long_vec) + + with pytest.raises(RuntimeError, match="c.ndim must be 1 if n_trans = 1"): + plan = Plan(1, (8,), dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones((2, 1), dtype="complex64")) + + with pytest.raises(RuntimeError, match="c.size must be same as x.size"): + plan = Plan(1, (8,), dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones(4, dtype="complex64")) + + with pytest.raises(RuntimeError, match="c.ndim must be 2 if n_trans > 1"): + plan = Plan(1, (8,), n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones(2, dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"c.shape must be \(n_trans, x.size\)"): + plan = Plan(1, (8,), n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones((2, 4), dtype="complex64")) + + with pytest.raises(RuntimeError, match="same as the problem dimension for"): + plan = Plan(2, (8,), dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones((2, 8), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"same as the problem dimension \+ 1 for"): + plan = Plan(2, (8,), n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones(8, dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.shape\[0\] must be n_trans"): + plan = Plan(2, (8,), n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones((3, 8), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"): + plan = Plan(2, (8,), dtype="complex64") + 
plan.setpts(np.ones(3, dtype="float32")) + plan.execute(np.ones(2, dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"): + plan = Plan(2, (8, 9), dtype="complex64") + plan.setpts(*np.ones((2, 3), dtype="float32")) + plan.execute(np.ones((2, 9), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"): + plan = Plan(2, (8, 9, 10), dtype="complex64") + plan.setpts(*np.ones((3, 3), dtype="float32")) + plan.execute(np.ones((2, 9, 10), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.ndim must be 1"): + plan = Plan(3, 1, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32")) + plan.execute(np.ones(3, dtype="complex64"), out=np.ones((2, 3), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.size of must be nk"): + plan = Plan(3, 1, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32")) + plan.execute(np.ones(3, dtype="complex64"), out=np.ones(4, dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.ndim must be 2"): + plan = Plan(3, 1, n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32")) + plan.execute(np.ones((2, 3), dtype="complex64"), out=np.ones(3, dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"f\.shape must be \(n_trans, nk\)"): + plan = Plan(3, 1, n_trans=2, dtype="complex64") + plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32")) + plan.execute(np.ones((2, 3), dtype="complex64"), out=np.ones((2, 4), dtype="complex64")) + + with pytest.raises(RuntimeError, match=r"point out of range \[-3pi,3pi\]"): + plan = Plan(1, (8,), dtype="complex64") + plan.setpts(4 * np.pi * np.ones(1, dtype="float32")) + + with pytest.raises(RuntimeError, match="transform type invalid"): + plan = Plan(4, (8,)) diff --git a/python/finufft/test/test_finufft_simple.py b/python/finufft/test/test_finufft_simple.py new file mode 100644 index 000000000..29008b8bb --- /dev/null +++ b/python/finufft/test/test_finufft_simple.py @@ -0,0 +1,123 @@ +import pytest + +import numpy as np +import finufft + +import utils + + +SHAPES = [(7,), (8,), (7, 7), (7, 8), (8, 8), (7, 7, 7), (7, 8, 8), (8, 8, 8)] +N_PTS = [10, 11] +N_TRANS = [(), (2,)] +DTYPES = [np.complex64, np.complex128] +OUTPUT_ARGS = [False, True] + + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("n_pts", N_PTS) +@pytest.mark.parametrize("n_trans", N_TRANS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft1_simple(dtype, shape, n_pts, n_trans, output_arg): + dim = len(shape) + + funs = {1: finufft.nufft1d1, + 2: finufft.nufft2d1, + 3: finufft.nufft3d1} + + fun = funs[dim] + + pts, coefs = utils.type1_problem(dtype, shape, n_pts, n_trans) + + # See if it can handle square sizes from ints + if all(n == shape[0] for n in shape): + _shape = shape[0] + else: + _shape = shape + + if not output_arg: + sig = fun(*pts, coefs, _shape) + else: + sig = np.empty(n_trans + shape, dtype=dtype) + fun(*pts, coefs, out=sig) + + utils.verify_type1(pts, coefs, shape, sig, 1e-6) + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("shape", SHAPES) +@pytest.mark.parametrize("n_pts", N_PTS) +@pytest.mark.parametrize("n_trans", N_TRANS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft2_simple(dtype, shape, n_pts, n_trans, output_arg): + dim = len(shape) + + funs = {1: 
finufft.nufft1d2, + 2: finufft.nufft2d2, + 3: finufft.nufft3d2} + + fun = funs[dim] + + pts, sig = utils.type2_problem(dtype, shape, n_pts, n_trans) + + if not output_arg: + coefs = fun(*pts, sig) + else: + coefs = np.empty(n_trans + (n_pts,), dtype=dtype) + fun(*pts, sig, out=coefs) + + utils.verify_type2(pts, sig, coefs, 1e-6) + +@pytest.mark.parametrize("dtype", DTYPES) +@pytest.mark.parametrize("dim", list(set(len(shape) for shape in SHAPES))) +@pytest.mark.parametrize("n_source_pts", N_PTS) +@pytest.mark.parametrize("n_target_pts", N_PTS) +@pytest.mark.parametrize("n_trans", N_TRANS) +@pytest.mark.parametrize("output_arg", OUTPUT_ARGS) +def test_finufft3_simple(dtype, dim, n_source_pts, n_target_pts, n_trans, output_arg): + funs = {1: finufft.nufft1d3, + 2: finufft.nufft2d3, + 3: finufft.nufft3d3} + + fun = funs[dim] + + source_pts, source_coefs, target_pts = utils.type3_problem(dtype, + dim, n_source_pts, n_target_pts, n_trans) + + if not output_arg: + target_coefs = fun(*source_pts, source_coefs, *target_pts) + else: + target_coefs = np.empty(n_trans + (n_target_pts,), dtype=dtype) + fun(*source_pts, source_coefs, *target_pts, out=target_coefs) + + utils.verify_type3(source_pts, source_coefs, target_pts, target_coefs, 1e-6) + +def test_finufft_simple_errors(): + with pytest.raises(RuntimeError, match="x dtype should be"): + finufft.nufft1d1(np.zeros(1, "int64"), np.zeros(1), (4,)) + + with pytest.raises(RuntimeError, match="n_modes must be"): + finufft.nufft1d1(np.zeros(1), np.zeros(1), "") + + with pytest.raises(RuntimeError, match="n_modes dimension does not match"): + finufft.nufft1d1(np.zeros(1), np.zeros(1), (2, 2)) + + with pytest.raises(RuntimeError, match="elements of input n_modes must"): + finufft.nufft1d1(np.zeros(1), np.zeros(1), (2.5,)) + + with pytest.raises(RuntimeError, match="type 1 input must supply n_modes"): + finufft.nufft1d1(np.zeros(1), np.zeros(1)) + + with pytest.raises(RuntimeError, match="c.size must be divisible by x.size"): + finufft.nufft1d1(np.zeros(2), np.zeros(3, np.complex128), 4) + + with pytest.raises(RuntimeError, match="type 2 input dimension must be either dim"): + finufft.nufft1d2(np.zeros(1), np.zeros((2, 2, 2), np.complex128)) + + with pytest.raises(RuntimeError, match="type 2 input dimension must be either dim"): + finufft.nufft1d1(np.zeros(1), np.zeros(1, np.complex128), 4, out=np.zeros((2, 2, 4), np.complex128)) + + with pytest.raises(RuntimeError, match="input n_trans and output n_trans do not match"): + finufft.nufft1d1(np.zeros(1), np.zeros((3, 1), np.complex128), 4, out=np.zeros((2, 4), np.complex128)) + + with pytest.raises(RuntimeError, match="input n_modes and output n_modes do not match"): + finufft.nufft1d1(np.zeros(1), np.zeros(1, np.complex128), 4, out=np.zeros((3), np.complex128)) diff --git a/python/finufft/test/utils.py b/python/finufft/test/utils.py new file mode 100644 index 000000000..5d372f7f0 --- /dev/null +++ b/python/finufft/test/utils.py @@ -0,0 +1,194 @@ +import numpy as np + + +def _complex_dtype(dtype): + dtype = np.dtype(dtype) + + if dtype == np.float32: + complex_dtype = np.complex64 + elif dtype == np.float64: + complex_dtype = np.complex128 + else: + raise TypeError("dtype should be np.float32 or np.float64.") + + return complex_dtype + + +def _real_dtype(complex_dtype): + complex_dtype = np.dtype(complex_dtype) + + if complex_dtype == np.complex64: + real_dtype = np.float32 + elif complex_dtype == np.complex128: + real_dtype = np.float64 + else: + raise TypeError("dtype should be np.complex64 or 
np.complex128.") + + return real_dtype + + +def gen_nu_pts(M, dim=3, seed=0): + np.random.seed(seed) + k = np.random.uniform(-np.pi, np.pi, (dim, M)) + k = k.astype(np.float64) + return k + + +def gen_uniform_data(shape, seed=0): + np.random.seed(seed) + fk = np.random.standard_normal(shape + (2,)) + fk = fk.astype(np.float64).view(np.complex128)[..., 0] + return fk + + +def gen_nonuniform_data(M, seed=0, n_trans=()): + np.random.seed(seed) + c = np.random.standard_normal(2 * M * int(np.prod(n_trans))) + c = c.astype(np.float64).view(np.complex128) + c = c.reshape(n_trans + (M,)) + return c + + +def gen_sig_idx(n_modes, n_tr): + idx = tuple(np.random.randint(0, n) for n in n_tr + n_modes) + return idx + + +def gen_coef_ind(n_pts, n_tr): + ind = tuple(np.random.randint(0, n) for n in n_tr + (n_pts,)) + return ind + + +def type1_problem(dtype, shape, M, n_trans=()): + real_dtype = _real_dtype(dtype) + dim = len(shape) + + k = gen_nu_pts(M, dim=dim).astype(real_dtype) + c = gen_nonuniform_data(M, n_trans=n_trans).astype(dtype) + + return k, c + + +def type2_problem(dtype, shape, M, n_trans=()): + real_dtype = _real_dtype(dtype) + dim = len(shape) + + k = gen_nu_pts(M, dim=dim).astype(real_dtype) + fk = gen_uniform_data(n_trans + shape).astype(dtype) + + return k, fk + +def type3_problem(dtype, dim, n_source_pts, n_target_pts, n_trans=()): + real_dtype = _real_dtype(dtype) + + source_pts = gen_nu_pts(n_source_pts, dim=dim).astype(real_dtype) + source_coefs = gen_nonuniform_data(n_source_pts, n_trans=n_trans).astype(dtype) + target_pts = gen_nu_pts(n_target_pts, dim=dim).astype(real_dtype) + + return source_pts, source_coefs, target_pts + + +def make_grid(shape): + dim = len(shape) + shape = shape + + grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape] + grids = np.meshgrid(*grids, indexing='ij') + return np.stack(grids) + + +def direct_type1(pts, coefs, n_modes, idx): + dtype = coefs.dtype + dim = len(n_modes) + + _idx = (np.array(idx[-dim:]) - np.floor(np.array(n_modes) / 2)).astype(dtype) + + _coefs = coefs[idx[:-dim]] + + sig = np.sum(np.exp(1j * np.sum(_idx[:, np.newaxis] * pts, axis=0)) * _coefs) + + return sig + + +def direct_type2(pts, sig, ind): + dtype = sig.dtype + dim = pts.shape[0] + n_modes = sig.shape[-dim:] + + grids = [slice(-np.floor(n / 2), np.ceil(n / 2)) for n in n_modes] + grid = np.mgrid[grids].astype(dtype) + + pt = pts[:, ind[-1]] + pt = pt.reshape((dim,) + dim * (1,)) + + _sig = sig[ind[:-1]] + + coef = np.sum(np.exp(-1j * np.sum(pt * grid, axis=0)) * _sig) + + return coef + + +def direct_type3(source_pts, source_coefs, target_pts, ind): + target_pt = target_pts[:, ind[-1]] + target_pt = target_pt[:, np.newaxis] + + _source_coef = source_coefs[ind[:-1]] + + target_coef = np.sum(np.exp(1j * np.sum(target_pt * source_pts, axis=0)) * _source_coef) + + return target_coef + + +def verify_type1(pts, coefs, shape, sig_est, tol): + dim = pts.shape[0] + + n_trans = coefs.shape[:-1] + + assert sig_est.shape[:-dim] == n_trans + assert sig_est.shape[-dim:] == shape + + idx = gen_sig_idx(shape, n_trans) + + fk_est = sig_est[idx] + fk_target = direct_type1(pts, coefs, shape, idx) + + type1_rel_err = np.linalg.norm(fk_target - fk_est) / np.linalg.norm(fk_target) + + assert type1_rel_err < 25 * tol + + +def verify_type2(pts, sig, coefs_est, tol): + dim = pts.shape[0] + + n_trans = sig.shape[:-dim] + n_pts = pts.shape[-1] + + assert coefs_est.shape == n_trans + (n_pts,) + + ind = gen_coef_ind(n_pts, n_trans) + + c_est = coefs_est[ind] + c_target = direct_type2(pts, sig, ind) + 
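+ # NOTE: direct_type2 evaluates the type-2 sum naively at the single random index drawn by gen_coef_ind, so only one output entry is spot-checked; this keeps the direct evaluation cheap while still exercising the full fast transform.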
+ type2_rel_err = np.linalg.norm(c_target - c_est) / np.linalg.norm(c_target) + + assert type2_rel_err < 25 * tol + + +def verify_type3(source_pts, source_coef, target_pts, target_coef, tol): + dim = source_pts.shape[0] + + n_source_pts = source_pts.shape[-1] + n_target_pts = target_pts.shape[-1] + n_tr = source_coef.shape[:-1] + + assert target_coef.shape == n_tr + (n_target_pts,) + + ind = gen_coef_ind(n_source_pts, n_tr) + + target_est = target_coef[ind] + target_true = direct_type3(source_pts, source_coef, target_pts, ind) + + type3_rel_err = np.linalg.norm(target_est - target_true) / np.linalg.norm(target_true) + + assert type3_rel_err < 100 * tol diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu index 841726376..dce4cf67c 100644 --- a/src/cuda/1d/cufinufft1d.cu +++ b/src/cuda/1d/cufinufft1d.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -33,36 +33,36 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla { assert(d_plan->spopts.spread_direction == 1); - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - checkCudaErrors( - cudaMemset(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex))); // this is needed + // this is needed + if ((ier = checkCudaErrors( + cudaMemset(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex))))) + return ier; // Step 1: Spread - ier = cuspread1d(d_plan, blksize); - - if (ier != 0) { - printf("error: cuspread1d, method(%d)\n", d_plan->opts.gpu_method); + if ((ier = cuspread1d(d_plan, blksize))) return ier; - } // Step 2: FFT - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - cudeconvolve1d(d_plan, blksize); + if ((ier = cudeconvolve1d(d_plan, blksize))) + return ier; } - return ier; + return 0; } template @@ -82,12 +82,11 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla { assert(d_plan->spopts.spread_direction == 2); - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms; @@ -95,21 +94,23 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - cudeconvolve1d(d_plan, blksize); + if ((ier = cudeconvolve1d(d_plan, blksize))) + return ier; // Step 2: FFT cudaDeviceSynchronize(); - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + RETURN_IF_CUDA_ERROR - // Step 3: deconvolve and shuffle - ier = cuinterp1d(d_plan, blksize); + cufftResult cufft_status = 
cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; - if (ier != 0) { - printf("error: cuinterp1d, method(%d)\n", d_plan->opts.gpu_method); + // Step 3: deconvolve and shuffle + if ((ier = cuinterp1d(d_plan, blksize))) return ier; - } } - return ier; + + return 0; } template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu index 29037290e..36fb6a0f0 100644 --- a/src/cuda/1d/interp1d_wrapper.cu +++ b/src/cuda/1d/interp1d_wrapper.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -54,15 +54,11 @@ inline int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, T *d_kx, cu } ier = cuinterp1d(d_plan, 1); - freegpumemory1d(d_plan); + freegpumemory(d_plan); return ier; } -template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, float *d_kx, cuda_complex *d_c, - cufinufft_plan_t *d_plan); -template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, double *d_kx, cuda_complex *d_c, - cufinufft_plan_t *d_plan); template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) @@ -83,14 +79,10 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread1d_gpu_nuptsdriven" << std::endl; - return 1; - } } break; default: - std::cout << "error: incorrect method, should be 1" << std::endl; - return 2; + std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; } return ier; @@ -119,20 +111,27 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven_horner<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, - sigma, d_idxnupts, pirange); + interp_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, - es_beta, d_idxnupts, pirange); + interp_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } return 0; } +template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, float *d_kx, cuda_complex *d_c, + cufinufft_plan_t *d_plan); +template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, double *d_kx, cuda_complex *d_c, + cufinufft_plan_t *d_plan); template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp1d(cufinufft_plan_t *d_plan, int blksize); + } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/1d/spread1d_wrapper.cu b/src/cuda/1d/spread1d_wrapper.cu index a6730a29f..233b40f1c 100644 --- a/src/cuda/1d/spread1d_wrapper.cu +++ b/src/cuda/1d/spread1d_wrapper.cu @@ -1,5 +1,5 @@ #include -#include +#include #include #include @@ -59,11 +59,12 @@ int cufinufft_spread1d(int nf1, cuda_complex *d_fw, int M, T *d_kx, cuda_comp } ier = cuspread1d(d_plan, 1); - freegpumemory1d(d_plan); + freegpumemory(d_plan); return ier; } + template int cuspread1d(cufinufft_plan_t *d_plan, int blksize) /* @@ -83,24 +84,15 @@ int cuspread1d(cufinufft_plan_t *d_plan, int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = 
cuspread1d_nuptsdriven(nf1, M, d_plan, blksize); - - if (ier != 0) { - std::cout << "error: cnufftspread1d_gpu_nuptsdriven" << std::endl; - return 1; - } } break; case 2: { ier = cuspread1d_subprob(nf1, M, d_plan, blksize); - - if (ier != 0) { - std::cout << "error: cnufftspread1d_gpu_subprob" << std::endl; - return 1; - } } break; default: - std::cout << "error: incorrect method, should be 1,2" << std::endl; - return 2; + std::cerr << "[cuspread1d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; } + return ier; } @@ -109,8 +101,8 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { if (d_plan->opts.gpu_sort) { int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { - std::cout << "error: invalid binsize (binsizex) = (" << bin_size_x << ")" << std::endl; - return 1; + std::cerr << "[cuspread1d_nuptsdriven_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins = ceil((T)nf1 / bin_size_x); @@ -123,10 +115,12 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { int *d_idxnupts = d_plan->idxnupts; int pirange = d_plan->spopts.pirange; - - checkCudaErrors(cudaMemset(d_binsize, 0, numbins * sizeof(int))); + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins * sizeof(int))))) + return ier; calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024>>>(M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR int n = numbins; thrust::device_ptr d_ptr(d_binsize); @@ -135,9 +129,11 @@ int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan) { calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024>>>(M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, pirange, nf1); + RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; trivial_global_sort_index_1d<<<(M + 1024 - 1) / 1024, 1024>>>(M, d_idxnupts); + RETURN_IF_CUDA_ERROR } return 0; @@ -166,13 +162,15 @@ int cuspread1d_nuptsdriven(int nf1, int M, cufinufft_plan_t *d_plan, int blks if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven_horner<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, - sigma, d_idxnupts, pirange); + spread_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, - es_beta, d_idxnupts, pirange); + spread_1d_nuptsdriven<<>>(d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -187,12 +185,12 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) which only needs to be done once. 
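In outline: the nonuniform points are binned by calc_bin_size_noghost_1d, bin start offsets are obtained from an inclusive scan of the bin sizes, calc_inverse_of_global_sort_idx_1d derives the global sort index, and calc_subprob_1d / map_b_into_subprob_1d split each bin into subproblems of at most opts.gpu_maxsubprobsize points (see the kernel launches below).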
*/ { + int ier; int maxsubprobsize = d_plan->opts.gpu_maxsubprobsize; int bin_size_x = d_plan->opts.gpu_binsizex; if (bin_size_x < 0) { - std::cout << "error: invalid binsize (binsizex) = ("; - std::cout << bin_size_x << ")" << std::endl; - return 1; + std::cerr << "[cuspread1d_subprob_prop] error: invalid binsize (binsizex) = (" << bin_size_x << ")\n"; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins = ceil((T)nf1 / bin_size_x); @@ -210,9 +208,11 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) int pirange = d_plan->spopts.pirange; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins * sizeof(int))); + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins * sizeof(int))))) + return ier; calc_bin_size_noghost_1d<<<(M + 1024 - 1) / 1024, 1024>>>(M, nf1, bin_size_x, numbins, d_binsize, d_kx, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR int n = numbins; thrust::device_ptr d_ptr(d_binsize); @@ -221,24 +221,35 @@ int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan) calc_inverse_of_global_sort_idx_1d<<<(M + 1024 - 1) / 1024, 1024>>>(M, bin_size_x, numbins, d_binstartpts, d_sortidx, d_kx, d_idxnupts, pirange, nf1); + RETURN_IF_CUDA_ERROR calc_subprob_1d<<<(M + 1024 - 1) / 1024, 1024>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins); + RETURN_IF_CUDA_ERROR d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(d_ptr, d_ptr + n, d_result); - checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))); + if ((ier = checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))))) + return ier; int totalnumsubprob; - checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))); + if ((ier = + checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)))) + return ier; + if ((ier = checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))))) + return ier; map_b_into_subprob_1d<<<(numbins + 1024 - 1) / 1024, 1024>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + assert(d_subprob_to_bin != NULL); - if (d_plan->subprob_to_bin != NULL) - cudaFree(d_plan->subprob_to_bin); + cudaFree(d_plan->subprob_to_bin); d_plan->subprob_to_bin = d_subprob_to_bin; - assert(d_plan->subprob_to_bin != NULL); d_plan->totalnumsubprob = totalnumsubprob; return 0; @@ -274,22 +285,25 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; + std::cerr << "[cuspread1d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_1d_subprob_horner<<>>( - d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, sigma, d_binstartpts, d_binsize, bin_size_x, - d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts, pirange); + spread_1d_subprob<<>>( + d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, + 
bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts, + pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_1d_subprob<<>>( + spread_1d_subprob<<>>( d_kx, d_c + t * M, d_fw + t * nf1, M, ns, nf1, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -297,8 +311,15 @@ int cuspread1d_subprob(int nf1, int M, cufinufft_plan_t *d_plan, int blksize) } template int cufinufft_spread1d(int nf1, cuda_complex *d_fw, int M, float *d_kx, cuda_complex *d_c, - cufinufft_plan_t *d_plan); + cufinufft_plan_t *d_plan); template int cufinufft_spread1d(int nf1, cuda_complex *d_fw, int M, double *d_kx, - cuda_complex *d_c, cufinufft_plan_t *d_plan); + cuda_complex *d_c, cufinufft_plan_t *d_plan); +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread1d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_nuptsdriven_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); +template int cuspread1d_subprob_prop(int nf1, int M, cufinufft_plan_t *d_plan); + } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/1d/spreadinterp1d.cuh b/src/cuda/1d/spreadinterp1d.cuh index fcdac5186..77ecf9a43 100644 --- a/src/cuda/1d/spreadinterp1d.cuh +++ b/src/cuda/1d/spreadinterp1d.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -15,37 +15,9 @@ namespace spreadinterp { /* ------------------------ 1d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void spread_1d_nuptsdriven(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, const int *idxnupts, int pirange) { - int xstart, xend; - int xx, ix; - T ker1[MAX_NSPREAD]; - - T x_rescaled; - T kervalue1; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - cnow = c[idxnupts[i]]; - - xstart = ceil(x_rescaled - ns / 2.0); - xend = floor(x_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - kervalue1 = ker1[xx - xstart]; - atomicAdd(&fw[ix].x, cnow.x * kervalue1); - atomicAdd(&fw[ix].y, cnow.y * kervalue1); - } - } -} - -template -__global__ void spread_1d_nuptsdriven_horner(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, T sigma, const int *idxnupts, int pirange) { + T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { int xx, ix; T ker1[MAX_NSPREAD]; @@ -58,7 +30,10 @@ __global__ void spread_1d_nuptsdriven_horner(const T *x, const cuda_complex * int xend = floor(x_rescaled + ns / 2.0); T x1 = (T)xstart - x_rescaled; - eval_kernel_vec_horner(ker1, x1, ns, sigma); + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (xx = xstart; xx <= xend; xx++) { ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); @@ -105,7 +80,7 @@ __global__ void calc_inverse_of_global_sort_idx_1d(int M, int bin_size_x, int nb } } -template +template __global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, T es_c, T es_beta, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, const int *subprob_to_bin, const int *subprobstartpts, @@ -144,7 +119,10 @@ __global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_com xend = floor(x_rescaled + ns / 2.0) - xoffset; T x1 = (T)xstart + xoffset - x_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); for (int xx = xstart; xx <= xend; xx++) { ix = xx + ceil(ns / 2.0); @@ -166,72 +144,11 @@ __global__ void spread_1d_subprob(const T *x, const cuda_complex *c, cuda_com } } -template -__global__ void spread_1d_subprob_horner(const T *x, const cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, T sigma, const int *binstartpts, const int *bin_size, int bin_size_x, - const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, - int maxsubprobsize, int nbinx, const int *idxnupts, int pirange) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, xend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)); - - T ker1[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - - T x_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = RESCALE(x[idxnupts[idx]], nf1, pirange); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - atomicAdd(&fwshared[ix].x, cnow.x * ker1[xx - xstart]); - atomicAdd(&fwshared[ix].y, cnow.y * ker1[xx - xstart]); - } - } - __syncthreads(); - - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - ix = xoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? 
ix - nf1 : ix); - atomicAdd(&fw[ix].x, fwshared[k].x); - atomicAdd(&fw[ix].y, fwshared[k].y); - } - } -} - /* --------------------- 1d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, - T es_c, T es_beta, const int *idxnupts, int pirange) { + T es_c, T es_beta, T sigma, const int *idxnupts, int pirange) { T ker1[MAX_NSPREAD]; for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); @@ -243,7 +160,11 @@ __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda cnow.y = 0.0; T x1 = (T)xstart - x_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + if constexpr (KEREVALMETH == 1) + eval_kernel_vec_horner(ker1, x1, ns, sigma); + else + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + for (int xx = xstart; xx <= xend; xx++) { int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); T kervalue1 = ker1[xx - xstart]; @@ -255,31 +176,5 @@ __global__ void interp_1d_nuptsdriven(const T *x, cuda_complex *c, const cuda } } -template -__global__ void interp_1d_nuptsdriven_horner(const T *x, cuda_complex *c, const cuda_complex *fw, int M, int ns, - int nf1, T sigma, const int *idxnupts, int pirange) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - - int xstart = ceil(x_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? 
xx - nf1 : xx); - cnow.x += fw[ix].x * ker1[xx - xstart]; - cnow.y += fw[ix].y * ker1[xx - xstart]; - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; - } -} - } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/2d/cufinufft2d.cu b/src/cuda/2d/cufinufft2d.cu index ca809314a..6c73ab636 100644 --- a/src/cuda/2d/cufinufft2d.cu +++ b/src/cuda/2d/cufinufft2d.cu @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include @@ -32,37 +32,36 @@ int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla { assert(d_plan->spopts.spread_direction == 1); - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - checkCudaErrors( - cudaMemset(d_plan->fw, 0, - d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex))); // this is needed + // this is needed + if ((ier = checkCudaErrors(cudaMemset( + d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * sizeof(cuda_complex))))) + return ier; // Step 1: Spread - ier = cuspread2d(d_plan, blksize); - - if (ier != 0) { - printf("error: cuspread2d, method(%d)\n", d_plan->opts.gpu_method); + if ((ier = cuspread2d(d_plan, blksize))) return ier; - } // Step 2: FFT - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - cudeconvolve2d(d_plan, blksize); + if ((ier = cudeconvolve2d(d_plan, blksize))) + return ier; } - return ier; + return 0; } template @@ -82,12 +81,11 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla { assert(d_plan->spopts.spread_direction == 2); - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt; @@ -95,20 +93,23 @@ int cufinufft2d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - cudeconvolve2d(d_plan, blksize); + if ((ier = cudeconvolve2d(d_plan, blksize))) + return ier; + // Step 2: FFT cudaDeviceSynchronize(); - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + RETURN_IF_CUDA_ERROR + + cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - ier = cuinterp2d(d_plan, blksize); - if (ier != 0) { - printf("error: cuinterp2d, method(%d)\n", d_plan->opts.gpu_method); + if ((ier = cuinterp2d(d_plan, blksize))) return ier; - } } - return ier; + return 0; } template int cufinufft2d1_exec(cuda_complex *d_c, cuda_complex *d_fk, diff --git a/src/cuda/2d/interp2d_wrapper.cu 
b/src/cuda/2d/interp2d_wrapper.cu index 11ba19276..9f6b27fab 100644 --- a/src/cuda/2d/interp2d_wrapper.cu +++ b/src/cuda/2d/interp2d_wrapper.cu @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -55,11 +55,12 @@ int cufinufft_interp2d(int nf1, int nf2, cuda_complex *d_fw, int M, T *d_kx, } ier = cuinterp2d(d_plan, 1); - freegpumemory2d(d_plan); + freegpumemory(d_plan); return ier; } + template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) /* @@ -80,21 +81,13 @@ int cuinterp2d(cufinufft_plan_t *d_plan, int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = cuinterp2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread2d_gpu_nuptsdriven" << std::endl; - return 1; - } } break; case 2: { ier = cuinterp2d_subprob(nf1, nf2, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cuinterp2d_subprob" << std::endl; - return 1; - } } break; default: - std::cout << "error: incorrect method, should be 1 or 2" << std::endl; - return 2; + std::cerr << "[cuinterp2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; } return ier; @@ -124,13 +117,17 @@ int cuinterp2d_nuptsdriven(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven_horner<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, - ns, nf1, nf2, sigma, d_idxnupts, pirange); + interp_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, + ns, nf1, nf2, es_c, es_beta, sigma, d_idxnupts, + pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, - nf1, nf2, es_c, es_beta, d_idxnupts, pirange); + interp_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, + ns, nf1, nf2, es_c, es_beta, sigma, d_idxnupts, + pirange); + RETURN_IF_CUDA_ERROR } } @@ -170,23 +167,25 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; + std::cerr << "[cuinterp2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_2d_subprob_horner<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, sigma, d_binstartpts, d_binsize, - bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], - numbins[1], d_idxnupts, pirange); + interp_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, + d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_2d_subprob<<>>( + interp_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -194,10 +193,9 @@ int cuinterp2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int } template int cufinufft_interp2d(int nf1, int 
nf2, cuda_complex *d_fw, int M, float *d_kx, float *d_ky, - cuda_complex *d_c, cufinufft_plan_t *d_plan); + cuda_complex *d_c, cufinufft_plan_t *d_plan); template int cufinufft_interp2d(int nf1, int nf2, cuda_complex *d_fw, int M, double *d_kx, double *d_ky, - cuda_complex *d_c, cufinufft_plan_t *d_plan); - + cuda_complex *d_c, cufinufft_plan_t *d_plan); template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp2d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/2d/spread2d_wrapper.cu b/src/cuda/2d/spread2d_wrapper.cu index a165afaea..a1e895069 100644 --- a/src/cuda/2d/spread2d_wrapper.cu +++ b/src/cuda/2d/spread2d_wrapper.cu @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -62,7 +62,7 @@ int cufinufft_spread2d(int nf1, int nf2, cuda_complex *d_fw, int M, T *d_kx, ier = cuspread2d(d_plan, 1); - freegpumemory2d(d_plan); + freegpumemory(d_plan); return ier; } @@ -87,21 +87,13 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = cuspread2d_nuptsdriven(nf1, nf2, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread2d_gpu_nuptsdriven" << std::endl; - return 1; - } } break; case 2: { ier = cuspread2d_subprob(nf1, nf2, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread2d_gpu_subprob" << std::endl; - return 1; - } } break; default: - std::cout << "error: incorrect method, should be 1,2,3" << std::endl; - return 2; + std::cerr << "[cuspread2d] error: incorrect method, should be 1 or 2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; } return ier; @@ -110,13 +102,12 @@ int cuspread2d(cufinufft_plan_t *d_plan, int blksize) template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan) { if (d_plan->opts.gpu_sort) { - int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; if (bin_size_x < 0 || bin_size_y < 0) { - std::cout << "error: invalid binsize (binsizex, binsizey) = ("; - std::cout << bin_size_x << "," << bin_size_y << ")" << std::endl; - return 1; + std::cerr << "[cuspread2d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins[2]; @@ -132,10 +123,14 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_ int *d_idxnupts = d_plan->idxnupts; int pirange = d_plan->spopts.pirange; + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int))))) + return ier; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int))); calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024>>>( M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR + int n = numbins[0] * numbins[1]; thrust::device_ptr d_ptr(d_binsize); thrust::device_ptr d_result(d_binstartpts); @@ -144,10 +139,12 @@ int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_ calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024>>>( M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, pirange, nf1, nf2); + RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; trivial_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024>>>(M, d_idxnupts); + RETURN_IF_CUDA_ERROR } return 0; @@ -176,13 +173,17 @@ int cuspread2d_nuptsdriven(int nf1, int nf2, int M, 
cufinufft_plan_t *d_plan, blocks.y = 1; if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven_horner<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, - ns, nf1, nf2, sigma, d_idxnupts, pirange); + spread_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, + ns, nf1, nf2, es_c, es_beta, sigma, d_idxnupts, + pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, - nf1, nf2, es_c, es_beta, d_idxnupts, pirange); + spread_2d_nupts_driven<<>>(d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, + ns, nf1, nf2, es_c, es_beta, sigma, d_idxnupts, + pirange); + RETURN_IF_CUDA_ERROR } } @@ -201,9 +202,9 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan int bin_size_x = d_plan->opts.gpu_binsizex; int bin_size_y = d_plan->opts.gpu_binsizey; if (bin_size_x < 0 || bin_size_y < 0) { - std::cout << "error: invalid binsize (binsizex, binsizey) = ("; - std::cout << bin_size_x << "," << bin_size_y << ")" << std::endl; - return 1; + std::cerr << "[cuspread2d_subprob_prop] error: invalid binsize (binsizex, binsizey) = ("; + std::cerr << bin_size_x << "," << bin_size_y << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins[2]; numbins[0] = ceil((T)nf1 / bin_size_x); @@ -222,10 +223,13 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan int *d_subprob_to_bin = NULL; int pirange = d_plan->spopts.pirange; + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int))))) + return ier; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * sizeof(int))); calc_bin_size_noghost_2d<<<(M + 1024 - 1) / 1024, 1024>>>(M, nf1, nf2, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binsize, d_kx, d_ky, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1]; thrust::device_ptr d_ptr(d_binsize); @@ -235,24 +239,34 @@ int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan calc_inverse_of_global_sort_index_2d<<<(M + 1024 - 1) / 1024, 1024>>>(M, bin_size_x, bin_size_y, numbins[0], numbins[1], d_binstartpts, d_sortidx, d_kx, d_ky, d_idxnupts, pirange, nf1, nf2); + RETURN_IF_CUDA_ERROR calc_subprob_2d<<<(M + 1024 - 1) / 1024, 1024>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1]); + RETURN_IF_CUDA_ERROR d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(d_ptr, d_ptr + n, d_result); - checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))); + if ((ier = checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))))) + return ier; int totalnumsubprob; - checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))); + if ((ier = checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)))) + return ier; + if ((ier = checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))))) + return ier; map_b_into_subprob_2d<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + 
cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + assert(d_subprob_to_bin != NULL); - if (d_plan->subprob_to_bin != NULL) - cudaFree(d_plan->subprob_to_bin); + cudaFree(d_plan->subprob_to_bin); d_plan->subprob_to_bin = d_subprob_to_bin; - assert(d_plan->subprob_to_bin != NULL); d_plan->totalnumsubprob = totalnumsubprob; return 0; @@ -293,33 +307,42 @@ int cuspread2d_subprob(int nf1, int nf2, int M, cufinufft_plan_t *d_plan, int size_t sharedplanorysize = (bin_size_x + 2 * (int)ceil(ns / 2.0)) * (bin_size_y + 2 * (int)ceil(ns / 2.0)) * sizeof(cuda_complex); if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; + std::cerr << "[cuspread2d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - spread_2d_subprob_horner<<>>( - d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, sigma, d_binstartpts, d_binsize, - bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], - numbins[1], d_idxnupts, pirange); + spread_2d_subprob<<>>( + d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, + d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, + numbins[0], numbins[1], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_2d_subprob<<>>( + spread_2d_subprob<<>>( d_kx, d_ky, d_c + t * M, d_fw + t * nf1 * nf2, M, ns, nf1, nf2, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } return 0; } + +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread2d(cufinufft_plan_t *d_plan, int blksize); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_subprob_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); +template int cuspread2d_nuptsdriven_prop(int nf1, int nf2, int M, cufinufft_plan_t *d_plan); template int cufinufft_spread2d(int nf1, int nf2, cuda_complex *d_fw, int M, float *d_kx, float *d_ky, cuda_complex *d_c, cufinufft_plan_t *d_plan); template int cufinufft_spread2d(int nf1, int nf2, cuda_complex *d_fw, int M, double *d_kx, double *d_ky, cuda_complex *d_c, cufinufft_plan_t *d_plan); - + } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/2d/spreadinterp2d.cuh b/src/cuda/2d/spreadinterp2d.cuh index b3e862d5e..f6f3341d3 100644 --- a/src/cuda/2d/spreadinterp2d.cuh +++ b/src/cuda/2d/spreadinterp2d.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -15,9 +15,10 @@ namespace spreadinterp { /* ------------------------ 2d Spreading Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, const int *idxnupts, int pirange) { + int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts, + int pirange) { int xstart, ystart, xend, yend; int xx, yy, ix, iy; int outidx; @@ -39,8 +40,15 @@ __global__ void spread_2d_nupts_driven(const T *x, 
const T *y, const cuda_comple T x1 = (T)xstart - x_rescaled; T y1 = (T)ystart - y_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } + for (yy = ystart; yy <= yend; yy++) { for (xx = xstart; xx <= xend; xx++) { ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); @@ -55,47 +63,6 @@ __global__ void spread_2d_nupts_driven(const T *x, const T *y, const cuda_comple } } -template -__global__ void spread_2d_nupts_driven_horner(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, - int M, int ns, int nf1, int nf2, T sigma, const int *idxnupts, - int pirange) { - int xx, yy, ix, iy; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker1val, ker2val; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); - cnow = c[idxnupts[i]]; - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - - for (yy = ystart; yy <= yend; yy++) { - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - outidx = ix + iy * nf1; - ker1val = ker1[xx - xstart]; - ker2val = ker2[yy - ystart]; - T kervalue = ker1val * ker2val; - atomicAdd(&fw[outidx].x, cnow.x * kervalue); - atomicAdd(&fw[outidx].y, cnow.y * kervalue); - } - } - } -} - /* Kernels for SubProb Method */ // SubProb properties template @@ -144,7 +111,7 @@ __global__ void calc_inverse_of_global_sort_index_2d(int M, int bin_size_x, int } } -template +template __global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, @@ -190,8 +157,14 @@ __global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex T x1 = (T)xstart + xoffset - x_rescaled; T y1 = (T)ystart + yoffset - y_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + } for (int yy = ystart; yy <= yend; yy++) { iy = yy + ceil(ns / 2.0); @@ -209,89 +182,8 @@ __global__ void spread_2d_subprob(const T *x, const T *y, const cuda_complex } } } - __syncthreads(); - /* write to global memory */ - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? 
ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); - } - } -} - -template -__global__ void spread_2d_subprob_horner(const T *x, const T *y, const cuda_complex *c, cuda_complex *fw, int M, - int ns, int nf1, int nf2, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, int *subprob_to_bin, int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, - const int *idxnupts, int pirange) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = RESCALE(x[idxnupts[idx]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[idx]], nf2, pirange); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - - for (int yy = ystart; yy <= yend; yy++) { - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + (int)ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2); - } - } - } __syncthreads(); - /* write to global memory */ for (int k = threadIdx.x; k < N; k += blockDim.x) { int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); @@ -311,9 +203,10 @@ __global__ void spread_2d_subprob_horner(const T *x, const T *y, const cuda_comp /* --------------------- 2d Interpolation Kernels ----------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T es_c, T es_beta, const int *idxnupts, int pirange) { + int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, const int *idxnupts, + int pirange) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); @@ -330,46 +223,13 @@ __global__ 
void interp_2d_nupts_driven(const T *x, const T *y, cuda_complex * T x1 = (T)xstart - x_rescaled; T y1 = (T)ystart - y_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - int inidx = ix + iy * nf1; - T kervalue1 = ker1[xx - xstart]; - cnow.x += fw[inidx].x * kervalue1 * kervalue2; - cnow.y += fw[inidx].y * kervalue1 * kervalue2; - } + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; - } -} - -template -__global__ void interp_2d_nupts_driven_horner(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, T sigma, const int *idxnupts, - int pirange) { - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); for (int yy = ystart; yy <= yend; yy++) { T kervalue2 = ker2[yy - ystart]; @@ -388,7 +248,7 @@ __global__ void interp_2d_nupts_driven_horner(const T *x, const T *y, cuda_compl } /* Kernels for Subprob Method */ -template +template __global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, T es_c, T es_beta, T sigma, int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, int *subprob_to_bin, const int *subprobstartpts, @@ -445,82 +305,13 @@ __global__ void interp_2d_subprob(const T *x, const T *y, cuda_complex *c, co T x1 = (T)xstart + xoffset - x_rescaled; T y1 = (T)ystart + yoffset - y_rescaled; - - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - iy = yy + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; - } + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); } - c[idxnupts[idx]] = cnow; - } -} - -template -__global__ void interp_2d_subprob_horner(const T *x, const T *y, cuda_complex *c, const cuda_complex *fw, int M, - int ns, int nf1, int nf2, T sigma, const int *binstartpts, const int *bin_size, - int bin_size_x, int bin_size_y, const int *subprob_to_bin, - const int *subprobstartpts, const int *numsubprob, int maxsubprobsize, 
- int nbinx, int nbiny, int *idxnupts, int pirange) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = (bidx / nbinx) * bin_size_y; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)); - - for (int k = threadIdx.x; k < N; k += blockDim.x) { - int i = k % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = k / (bin_size_x + 2 * ceil(ns / 2.0)); - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - outidx = ix + iy * nf1; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; - } - } - __syncthreads(); - - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - - T x_rescaled, y_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = RESCALE(x[idxnupts[idx]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[idx]], nf2, pirange); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); for (int yy = ystart; yy <= yend; yy++) { T kervalue2 = ker2[yy - ystart]; @@ -528,7 +319,6 @@ __global__ void interp_2d_subprob_horner(const T *x, const T *y, cuda_complex ix = xx + ceil(ns / 2.0); iy = yy + ceil(ns / 2.0); outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; cnow.x += fwshared[outidx].x * kervalue1 * kervalue2; cnow.y += fwshared[outidx].y * kervalue1 * kervalue2; diff --git a/src/cuda/3d/cufinufft3d.cu b/src/cuda/3d/cufinufft3d.cu index 131cebe9d..50250b44f 100644 --- a/src/cuda/3d/cufinufft3d.cu +++ b/src/cuda/3d/cufinufft3d.cu @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -30,34 +30,34 @@ int cufinufft3d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; d_plan->c = d_cstart; d_plan->fk = d_fkstart; - checkCudaErrors(cudaMemset( - d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * d_plan->nf3 * sizeof(cuda_complex))); + if ((ier = checkCudaErrors( + cudaMemset(d_plan->fw, 0, + d_plan->maxbatchsize * d_plan->nf1 * d_plan->nf2 * d_plan->nf3 * sizeof(cuda_complex))))) + return ier; // Step 1: Spread - ier = 
cuspread3d(d_plan, blksize); - - if (ier != 0) { - printf("error: cuspread3d, method(%d)\n", d_plan->opts.gpu_method); + if ((ier = cuspread3d(d_plan, blksize))) return ier; - } // Step 2: FFT - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - cudeconvolve3d(d_plan, blksize); + if ((ier = cudeconvolve3d(d_plan, blksize))) + return ier; } return 0; @@ -78,12 +78,11 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla Melody Shih 07/25/19 */ { - int blksize; int ier; cuda_complex *d_fkstart; cuda_complex *d_cstart; for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) { - blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); + int blksize = min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize); d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M; d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms * d_plan->mt * d_plan->mu; @@ -91,19 +90,19 @@ int cufinufft3d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla d_plan->fk = d_fkstart; // Step 1: amplify Fourier coeffs fk and copy into upsampled array fw - cudeconvolve3d(d_plan, blksize); + if ((ier = cudeconvolve3d(d_plan, blksize))) + return ier; // Step 2: FFT cudaDeviceSynchronize(); - cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + RETURN_IF_CUDA_ERROR + cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag); + if (cufft_status != CUFFT_SUCCESS) + return FINUFFT_ERR_CUDA_FAILURE; // Step 3: deconvolve and shuffle - ier = cuinterp3d(d_plan, blksize); - - if (ier != 0) { - printf("error: cuinterp3d, method(%d)\n", d_plan->opts.gpu_method); + if ((ier = cuinterp3d(d_plan, blksize))) return ier; - } } return 0; diff --git a/src/cuda/3d/interp3d_wrapper.cu b/src/cuda/3d/interp3d_wrapper.cu index 2e3733705..a9271a372 100644 --- a/src/cuda/3d/interp3d_wrapper.cu +++ b/src/cuda/3d/interp3d_wrapper.cu @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -58,7 +58,7 @@ int cufinufft_interp3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, ier = cuinterp3d(d_plan, 1); - freegpumemory3d(d_plan); + freegpumemory(d_plan); return ier; } @@ -84,21 +84,13 @@ int cuinterp3d(cufinufft_plan_t *d_plan, int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = cuinterp3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread3d_gpu_nuptsdriven" << std::endl; - return 1; - } } break; case 2: { ier = cuinterp3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - if (ier != 0) { - std::cout << "error: cnufftspread3d_gpu_subprob" << std::endl; - return 1; - } } break; default: - std::cout << "error: incorrect method, should be 1,2" << std::endl; - return 2; + std::cerr << "[cuinterp3d] error: incorrect method, should be 1,2\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; } return ier; @@ -130,21 +122,24 @@ int cuinterp3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t if (d_plan->opts.gpu_kerevalmeth) { for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven_horner<<>>(d_kx, d_ky, d_kz, d_c + t * M, - d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, - nf2, nf3, sigma, d_idxnupts, pirange); + interp_3d_nupts_driven + <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, + nf1, nf2, nf3, es_c, es_beta, sigma, 
d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - interp_3d_nupts_driven<<>>(d_kx, d_ky, d_kz, d_c + t * M, - d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, - es_c, es_beta, d_idxnupts, pirange); + interp_3d_nupts_driven + <<>>(d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, + nf1, nf2, nf3, es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } return 0; } + template int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan, int blksize) { int ns = d_plan->spopts.nspread; // psi's support in terms of number of cells @@ -180,21 +175,23 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; + std::cerr << "[cuinterp3d_subprob] error: not enough shared memory\n"; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { - interp_3d_subprob_horner<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, - maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, + d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } else { - interp_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, + interp_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -202,11 +199,10 @@ int cuinterp3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ } template int cufinufft_interp3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, float *d_kx, float *d_ky, - float *d_kz, cuda_complex *d_c, cufinufft_plan_t *d_plan); + float *d_kz, cuda_complex *d_c, cufinufft_plan_t *d_plan); template int cufinufft_interp3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, double *d_kx, - double *d_ky, double *d_kz, cuda_complex *d_c, - cufinufft_plan_t *d_plan); - + double *d_ky, double *d_kz, cuda_complex *d_c, + cufinufft_plan_t *d_plan); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); template int cuinterp3d(cufinufft_plan_t *d_plan, int blksize); diff --git a/src/cuda/3d/spread3d_wrapper.cu b/src/cuda/3d/spread3d_wrapper.cu index 9ae0d4d56..154fbe7cd 100644 --- a/src/cuda/3d/spread3d_wrapper.cu +++ b/src/cuda/3d/spread3d_wrapper.cu @@ -3,7 +3,7 @@ #include #include -#include +#include #include #include @@ -67,7 +67,7 @@ int cufinufft_spread3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, } ier = cuspread3d(d_plan, 1); - freegpumemory3d(d_plan); + freegpumemory(d_plan); return ier; } @@ -94,35 +94,22 @@ int cuspread3d(cufinufft_plan_t *d_plan, 
int blksize) switch (d_plan->opts.gpu_method) { case 1: { ier = cuspread3d_nuptsdriven(nf1, nf2, nf3, M, d_plan, blksize); - - if (ier != 0) { - std::cout << "error: cnufftspread3d_gpu_subprob" << std::endl; - return 1; - } } break; case 2: { ier = cuspread3d_subprob(nf1, nf2, nf3, M, d_plan, blksize); - - if (ier != 0) { - std::cout << "error: cnufftspread3d_gpu_subprob" << std::endl; - return 1; - } } break; case 4: { ier = cuspread3d_blockgather(nf1, nf2, nf3, M, d_plan, blksize); - - if (ier != 0) { - std::cout << "error: cnufftspread3d_gpu_subprob" << std::endl; - return 1; - } } break; default: - std::cerr << "error: incorrect method, should be 1,2,4" << std::endl; - return 2; + std::cerr << "[cuspread3d] error: incorrect method, should be 1,2,4" << std::endl; + ier = FINUFFT_ERR_METHOD_NOTVALID; } + return ier; } + template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan) { if (d_plan->opts.gpu_sort) { @@ -130,9 +117,9 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan int bin_size_y = d_plan->opts.gpu_binsizey; int bin_size_z = d_plan->opts.gpu_binsizez; if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cout << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cout << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return 1; + std::cerr << "[cuspread3d_nuptsdriven_prop] error: invalid binsize (binsizex, binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins[3]; @@ -151,10 +138,13 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan int pirange = d_plan->spopts.pirange; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))))) + return ier; calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024>>>(M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1] * numbins[2]; thrust::device_ptr d_ptr(d_binsize); @@ -164,10 +154,11 @@ int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan calc_inverse_of_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024>>>( M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, pirange, nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR } else { int *d_idxnupts = d_plan->idxnupts; - trivial_global_sort_index_3d<<<(M + 1024 - 1) / 1024, 1024>>>(M, d_idxnupts); + RETURN_IF_CUDA_ERROR } return 0; @@ -198,15 +189,17 @@ int cuspread3d_nuptsdriven(int nf1, int nf2, int nf3, int M, cufinufft_plan_t if (d_plan->opts.gpu_kerevalmeth == 1) { for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven_horner<<>>(d_kx, d_ky, d_kz, d_c + t * M, - d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, - sigma, d_idxnupts, pirange); + spread_3d_nupts_driven<<>>(d_kx, d_ky, d_kz, d_c + t * M, + d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } else { for (int t = 0; t < blksize; t++) { - spread_3d_nupts_driven<<>>(d_kx, d_ky, d_kz, d_c + t * M, - d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, - es_beta, d_idxnupts, pirange); + 
spread_3d_nupts_driven<<>>(d_kx, d_ky, d_kz, d_c + t * M, + d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, + es_c, es_beta, sigma, d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -227,13 +220,12 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan int numobins[3]; if (nf1 % o_bin_size_x != 0 || nf2 % o_bin_size_y != 0 || nf3 % o_bin_size_z != 0) { - std::cout << "error: mod(nf1, opts.gpu_obinsizex) != 0" << std::endl; - std::cout << " mod(nf2, opts.gpu_obinsizey) != 0" << std::endl; - std::cout << " mod(nf3, opts.gpu_obinsizez) != 0" << std::endl; - std::cout << "error: (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" << std::endl; - std::cout << "error: (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(nf(1|2|3), opts.gpu_obinsize(x|y|z)) != 0" << std::endl; + std::cerr << " (nf1, nf2, nf3) = (" << nf1 << ", " << nf2 << ", " << nf3 << ")" << std::endl; + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; - return 1; + return FINUFFT_ERR_BINSIZE_NOTVALID; } numobins[0] = ceil((T)nf1 / o_bin_size_x); @@ -244,14 +236,13 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan int bin_size_y = d_plan->opts.gpu_binsizey; int bin_size_z = d_plan->opts.gpu_binsizez; if (o_bin_size_x % bin_size_x != 0 || o_bin_size_y % bin_size_y != 0 || o_bin_size_z % bin_size_z != 0) { - std::cout << "error: mod(ops.gpu_obinsizex, opts.gpu_binsizex) != 0" << std::endl; - std::cout << " mod(ops.gpu_obinsizey, opts.gpu_binsizey) != 0" << std::endl; - std::cout << " mod(ops.gpu_obinsizez, opts.gpu_binsizez) != 0" << std::endl; - std::cout << "error: (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " << bin_size_y << ", " + std::cerr << "[cuspread3d_blockgather_prop] error:\n"; + std::cerr << " mod(ops.gpu_obinsize(x|y|z), opts.gpu_binsize(x|y|z)) != 0" << std::endl; + std::cerr << " (binsizex, binsizey, binsizez) = (" << bin_size_x << ", " << bin_size_y << ", " << bin_size_z << ")" << std::endl; - std::cout << "error: (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " + std::cerr << " (obinsizex, obinsizey, obinsizez) = (" << o_bin_size_x << ", " << o_bin_size_y << ", " << o_bin_size_z << ")" << std::endl; - return 1; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int binsperobinx, binsperobiny, binsperobinz; @@ -275,10 +266,13 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan int *d_subprobstartpts = d_plan->subprobstartpts; int *d_subprob_to_bin = NULL; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))))) + return ier; locate_nupts_to_bins_ghost<<<(M + 1024 - 1) / 1024, 1024>>>( M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, binsperobinz, d_binsize, d_kx, d_ky, d_kz, d_sortidx, pirange, nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR threadsPerBlock.x = 8; threadsPerBlock.y = 8; @@ -290,20 +284,30 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan fill_ghost_bins<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], d_binsize); + RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1] * 
numbins[2]; thrust::device_ptr d_ptr(d_binsize); thrust::device_ptr d_result(d_binstartpts + 1); thrust::inclusive_scan(d_ptr, d_ptr + n, d_result); - checkCudaErrors(cudaMemset(d_binstartpts, 0, sizeof(int))); + if ((ier = checkCudaErrors(cudaMemset(d_binstartpts, 0, sizeof(int))))) + return ier; int totalNUpts; - checkCudaErrors(cudaMemcpy(&totalNUpts, &d_binstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMalloc(&d_idxnupts, totalNUpts * sizeof(int))); + if ((ier = checkCudaErrors(cudaMemcpy(&totalNUpts, &d_binstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)))) + return ier; + if ((ier = checkCudaErrors(cudaMalloc(&d_idxnupts, totalNUpts * sizeof(int))))) + return ier; calc_inverse_of_global_sort_index_ghost<<<(M + 1024 - 1) / 1024, 1024>>>( M, bin_size_x, bin_size_y, bin_size_z, numobins[0], numobins[1], numobins[2], binsperobinx, binsperobiny, binsperobinz, d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, pirange, nf1, nf2, nf3); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } threadsPerBlock.x = 2; threadsPerBlock.y = 2; @@ -315,8 +319,14 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan ghost_bin_pts_index<<>>(binsperobinx, binsperobiny, binsperobinz, numobins[0], numobins[1], numobins[2], d_binsize, d_idxnupts, d_binstartpts, M); - if (d_plan->idxnupts != NULL) - cudaFree(d_plan->idxnupts); + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_idxnupts); + return FINUFFT_ERR_CUDA_FAILURE; + } + + cudaFree(d_plan->idxnupts); d_plan->idxnupts = d_idxnupts; /* --------------------------------------------- */ @@ -326,20 +336,31 @@ int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan calc_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024>>>(binsperobinx, binsperobiny, binsperobinz, d_binsize, d_numsubprob, maxsubprobsize, numobins[0] * numobins[1] * numobins[2]); + RETURN_IF_CUDA_ERROR n = numobins[0] * numobins[1] * numobins[2]; d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(d_ptr, d_ptr + n, d_result); - checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))); + if ((ier = checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))))) + return ier; int totalnumsubprob; - checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))); + if ((ier = + checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)))) + return ier; + if ((ier = checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))))) + return ier; map_b_into_subprob_3d_v1<<<(n + 1024 - 1) / 1024, 1024>>>(d_subprob_to_bin, d_subprobstartpts, d_numsubprob, n); + err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + assert(d_subprob_to_bin != NULL); - if (d_plan->subprob_to_bin != NULL) - cudaFree(d_plan->subprob_to_bin); + cudaFree(d_plan->subprob_to_bin); d_plan->subprob_to_bin = d_subprob_to_bin; d_plan->totalnumsubprob = totalnumsubprob; @@ -384,29 +405,27 @@ int 
cuspread3d_blockgather(int nf1, int nf2, int nf3, int M, cufinufft_plan_t int totalnumsubprob = d_plan->totalnumsubprob; int *d_subprob_to_bin = d_plan->subprob_to_bin; + size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); + if (sharedplanorysize > 49152) { + std::cerr << "[cuspread3d_blockgather] error: not enough shared memory" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; + } + for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth == 1) { - size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; - } - spread_3d_block_gather_horner<<>>( + spread_3d_block_gather<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } else { - size_t sharedplanorysize = obin_size_x * obin_size_y * obin_size_z * sizeof(cuda_complex); - if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory" << std::endl; - return 1; - } - spread_3d_block_gather<<>>( + spread_3d_block_gather<<>>( d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, sigma, d_binstartpts, obin_size_x, obin_size_y, obin_size_z, binsperobinx * binsperobiny * binsperobinz, d_subprob_to_bin, d_subprobstartpts, maxsubprobsize, numobins[0], numobins[1], numobins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } @@ -420,9 +439,9 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_topts.gpu_binsizey; int bin_size_z = d_plan->opts.gpu_binsizez; if (bin_size_x < 0 || bin_size_y < 0 || bin_size_z < 0) { - std::cout << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; - std::cout << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; - return 1; + std::cerr << "error: invalid binsize (binsizex, binsizey, binsizez) = ("; + std::cerr << bin_size_x << "," << bin_size_y << "," << bin_size_z << ")" << std::endl; + return FINUFFT_ERR_BINSIZE_NOTVALID; } int numbins[3]; @@ -444,10 +463,13 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_tspopts.pirange; - checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); + int ier; + if ((ier = checkCudaErrors(cudaMemset(d_binsize, 0, numbins[0] * numbins[1] * numbins[2] * sizeof(int))))) + return ier; calc_bin_size_noghost_3d<<<(M + 1024 - 1) / 1024, 1024>>>(M, nf1, nf2, nf3, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binsize, d_kx, d_ky, d_kz, d_sortidx, pirange); + RETURN_IF_CUDA_ERROR int n = numbins[0] * numbins[1] * numbins[2]; thrust::device_ptr d_ptr(d_binsize); @@ -457,22 +479,32 @@ int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t>>( M, bin_size_x, bin_size_y, bin_size_z, numbins[0], numbins[1], numbins[2], d_binstartpts, d_sortidx, d_kx, d_ky, d_kz, d_idxnupts, pirange, nf1, nf2, nf3); + RETURN_IF_CUDA_ERROR /* --------------------------------------------- */ // Determining Subproblem properties // /* --------------------------------------------- */ calc_subprob_3d_v2<<<(M + 1024 - 1) / 1024, 1024>>>(d_binsize, d_numsubprob, maxsubprobsize, numbins[0] * numbins[1] * 
numbins[2]); + RETURN_IF_CUDA_ERROR d_ptr = thrust::device_pointer_cast(d_numsubprob); d_result = thrust::device_pointer_cast(d_subprobstartpts + 1); thrust::inclusive_scan(d_ptr, d_ptr + n, d_result); - checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))); - int totalnumsubprob; - checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int))); + if (checkCudaErrors(cudaMemset(d_subprobstartpts, 0, sizeof(int))) || + checkCudaErrors(cudaMemcpy(&totalnumsubprob, &d_subprobstartpts[n], sizeof(int), cudaMemcpyDeviceToHost)) || + checkCudaErrors(cudaMalloc(&d_subprob_to_bin, totalnumsubprob * sizeof(int)))) + return FINUFFT_ERR_CUDA_FAILURE; + map_b_into_subprob_3d_v2<<<(numbins[0] * numbins[1] + 1024 - 1) / 1024, 1024>>>( d_subprob_to_bin, d_subprobstartpts, d_numsubprob, numbins[0] * numbins[1] * numbins[2]); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + fprintf(stderr, "[%s] Error: %s\n", __func__, cudaGetErrorString(err)); + cudaFree(d_subprob_to_bin); + return FINUFFT_ERR_CUDA_FAILURE; + } + assert(d_subprob_to_bin != NULL); if (d_plan->subprob_to_bin != NULL) cudaFree(d_plan->subprob_to_bin); @@ -519,35 +551,43 @@ int cuspread3d_subprob(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_ size_t sharedplanorysize = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)) * sizeof(cuda_complex); if (sharedplanorysize > 49152) { - std::cout << "error: not enough shared memory (" << sharedplanorysize << ")" << std::endl; - return 1; + std::cerr << "[cuspread3d_subprob] error: not enough shared memory (" << sharedplanorysize << ")" << std::endl; + return FINUFFT_ERR_INSUFFICIENT_SHMEM; } for (int t = 0; t < blksize; t++) { if (d_plan->opts.gpu_kerevalmeth) { - spread_3d_subprob_horner<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, d_binstartpts, - d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, - maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, + d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, + d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } else { - spread_3d_subprob<<>>( - d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, es_c, es_beta, + spread_3d_subprob<<>>( + d_kx, d_ky, d_kz, d_c + t * M, d_fw + t * nf1 * nf2 * nf3, M, ns, nf1, nf2, nf3, sigma, es_c, es_beta, d_binstartpts, d_binsize, bin_size_x, bin_size_y, bin_size_z, d_subprob_to_bin, d_subprobstartpts, d_numsubprob, maxsubprobsize, numbins[0], numbins[1], numbins[2], d_idxnupts, pirange); + RETURN_IF_CUDA_ERROR } } return 0; } +template int cufinufft_spread3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, float *d_kx, + float *d_ky, float *d_kz, cuda_complex *d_c, + cufinufft_plan_t *d_plan); +template int cufinufft_spread3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, double *d_kx, + double *d_ky, double *d_kz, cuda_complex *d_c, + cufinufft_plan_t *d_plan); template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); template int cuspread3d(cufinufft_plan_t *d_plan, int blksize); +template int 
cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_nuptsdriven_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_subprob_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); +template int cuspread3d_blockgather_prop(int nf1, int nf2, int nf3, int M, cufinufft_plan_t *d_plan); -template int cufinufft_spread3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, float *d_kx, - float *d_ky, float *d_kz, cuda_complex *d_c, - cufinufft_plan_t *d_plan); -template int cufinufft_spread3d(int nf1, int nf2, int nf3, cuda_complex *d_fw, int M, double *d_kx, - double *d_ky, double *d_kz, cuda_complex *d_c, - cufinufft_plan_t *d_plan); } // namespace spreadinterp } // namespace cufinufft diff --git a/src/cuda/3d/spreadinterp3d.cuh b/src/cuda/3d/spreadinterp3d.cuh index 7475bfae6..80930b4be 100644 --- a/src/cuda/3d/spreadinterp3d.cuh +++ b/src/cuda/3d/spreadinterp3d.cuh @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include @@ -71,11 +71,10 @@ __global__ void calc_inverse_of_global_sort_index_3d(int M, int bin_size_x, int } /* Kernels for NUptsdriven method */ - -template -__global__ void spread_3d_nupts_driven_horner(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T sigma, - const int *idxnupts, int pirange) { +template +__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, + cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, + T sigma, const int *idxnupts, int pirange) { int xx, yy, zz, ix, iy, iz; int outidx; T ker1[MAX_NSPREAD]; @@ -101,59 +100,16 @@ __global__ void spread_3d_nupts_driven_horner(const T *x, const T *y, const T *z T y1 = (T)ystart - y_rescaled; T z1 = (T)zstart - z_rescaled; - eval_kernel_vec_horner(ker1, x1, ns, sigma); - eval_kernel_vec_horner(ker2, y1, ns, sigma); - eval_kernel_vec_horner(ker3, z1, ns, sigma); - for (zz = zstart; zz <= zend; zz++) { - ker3val = ker3[zz - zstart]; - for (yy = ystart; yy <= yend; yy++) { - ker2val = ker2[yy - ystart]; - for (xx = xstart; xx <= xend; xx++) { - ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? 
zz - nf3 : zz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - ker1val = ker1[xx - xstart]; - T kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); - atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); - } - } + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, x1, ns, sigma); + eval_kernel_vec_horner(ker2, y1, ns, sigma); + eval_kernel_vec_horner(ker3, z1, ns, sigma); + } else { + eval_kernel_vec(ker1, x1, ns, es_c, es_beta); + eval_kernel_vec(ker2, y1, ns, es_c, es_beta); + eval_kernel_vec(ker3, z1, ns, es_c, es_beta); } - } -} - -template -__global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, - const int *idxnupts, int pirange) { - int xx, yy, zz, ix, iy, iz; - int outidx; - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - T x_rescaled, y_rescaled, z_rescaled; - T ker1val, ker2val, ker3val; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); - z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange); - - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - T z1 = (T)zstart - z_rescaled; - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); for (zz = zstart; zz <= zend; zz++) { ker3val = ker3[zz - zstart]; for (yy = ystart; yy <= yend; yy++) { @@ -163,10 +119,8 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? 
zz - nf3 : zz); outidx = ix + iy * nf1 + iz * nf1 * nf2; - ker1val = ker1[xx - xstart]; T kervalue = ker1val * ker2val * ker3val; - atomicAdd(&fw[outidx].x, c[idxnupts[i]].x * kervalue); atomicAdd(&fw[outidx].y, c[idxnupts[i]].y * kervalue); } @@ -176,26 +130,23 @@ __global__ void spread_3d_nupts_driven(const T *x, const T *y, const T *z, const } /* Kernels for Subprob method */ - -template -__global__ void spread_3d_subprob_horner(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, - int nf1, int nf2, int nf3, T sigma, int *binstartpts, int *bin_size, - int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, - int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, - int nbiny, int nbinz, int *idxnupts, int pirange) { +template +__global__ void spread_3d_subprob(T *x, T *y, T *z, cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, + int nf2, int nf3, T sigma, T es_c, T es_beta, int *binstartpts, int *bin_size, + int bin_size_x, int bin_size_y, int bin_size_z, int *subprob_to_bin, + int *subprobstartpts, int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, + int nbinz, int *idxnupts, int pirange) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; - int xstart, ystart, xend, yend, zstart, zend; - int bidx = subprob_to_bin[blockIdx.x]; - int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; - int ix, iy, iz, outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); + const int bidx = subprob_to_bin[blockIdx.x]; + const int binsubp_idx = blockIdx.x - subprobstartpts[bidx]; + const int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; + const int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; + const int xoffset = (bidx % nbinx) * bin_size_x; + const int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; + const int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); @@ -204,185 +155,85 @@ __global__ void spread_3d_subprob_horner(T *x, T *y, T *z, cuda_complex *c, c fwshared[i].y = 0.0; } __syncthreads(); - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; for (int i = threadIdx.x; i < nupts; i += blockDim.x) { T ker1[MAX_NSPREAD]; T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; - int nuptsidx = idxnupts[ptstart + i]; - x_rescaled = RESCALE(x[nuptsidx], nf1, pirange); - y_rescaled = RESCALE(y[nuptsidx], nf2, pirange); - z_rescaled = RESCALE(z[nuptsidx], nf3, pirange); - cnow = c[nuptsidx]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); - - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); - if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0) - break; - for (int yy = 
ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); - if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) - break; - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) - break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - T kervalue1 = ker1[xx - xstart]; - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } - } - } - __syncthreads(); - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - iz = zoffset - ceil(ns / 2.0) + k; - - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); - atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); + const int nuptsidx = idxnupts[ptstart + i]; + const T x_rescaled = RESCALE(x[nuptsidx], nf1, pirange); + const T y_rescaled = RESCALE(y[nuptsidx], nf2, pirange); + const T z_rescaled = RESCALE(z[nuptsidx], nf3, pirange); + cuda_complex cnow = c[nuptsidx]; + + const int xstart = ceil(x_rescaled - ns / 2.0) - xoffset; + const int ystart = ceil(y_rescaled - ns / 2.0) - yoffset; + const int zstart = ceil(z_rescaled - ns / 2.0) - zoffset; + + const int xend = floor(x_rescaled + ns / 2.0) - xoffset; + const int yend = floor(y_rescaled + ns / 2.0) - yoffset; + const int zend = floor(z_rescaled + ns / 2.0) - zoffset; + + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); } - } -} - -template -__global__ void spread_3d_subprob(const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, const int *binstartpts, - const int *bin_size, int bin_size_x, int bin_size_y, int bin_size_z, - const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, - int maxsubprobsize, int nbinx, int nbiny, int nbinz, int *idxnupts, int pirange) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend, zstart, zend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, iz, outidx; - int ptstart = 
binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - int idx = ptstart + i; - x_rescaled = RESCALE(x[idxnupts[idx]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[idx]], nf2, pirange); - z_rescaled = RESCALE(z[idxnupts[idx]], nf3, pirange); - cnow = c[idxnupts[idx]]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - T z1 = (T)zstart + zoffset - z_rescaled; - - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); -#if 1 for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); + const T kervalue3 = ker3[zz - zstart]; + const int iz = zz + ceil(ns / 2.0); if (iz >= (bin_size_z + (int)ceil(ns / 2.0) * 2) || iz < 0) break; for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); + const T kervalue2 = ker2[yy - ystart]; + const int iy = yy + ceil(ns / 2.0); if (iy >= (bin_size_y + (int)ceil(ns / 2.0) * 2) || iy < 0) break; for (int xx = xstart; xx <= xend; xx++) { - T kervalue1 = ker1[xx - xstart]; - ix = xx + ceil(ns / 2.0); + const int ix = xx + ceil(ns / 2.0); if (ix >= (bin_size_x + (int)ceil(ns / 2.0) * 2) || ix < 0) break; - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); -#if 1 + const int outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + + iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); + const T kervalue1 = ker1[xx - xstart]; atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); -#endif } } } -#endif } __syncthreads(); + /* write to global memory */ for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); + const int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); + const int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); + const int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); + + int ix = xoffset - ceil(ns / 2.0) + i; + int iy = yoffset - ceil(ns / 2.0) + j; + int iz = zoffset - ceil(ns / 2.0) + k; - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - iz = zoffset - 
ceil(ns / 2.0) + k; if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); + const int outidx = ix + iy * nf1 + iz * nf1 * nf2; + const int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + + k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); atomicAdd(&fw[outidx].x, fwshared[sharedidx].x); atomicAdd(&fw[outidx].y, fwshared[sharedidx].y); } } } -/* Kernels for Block BlockGather Method */ +/* Kernels for BlockGather Method */ template __global__ void locate_nupts_to_bins_ghost(int M, int bin_size_x, int bin_size_y, int bin_size_z, int nobinx, int nobiny, int nobinz, int binsperobinx, int binsperobiny, int binsperobinz, @@ -436,114 +287,16 @@ __global__ void calc_inverse_of_global_sort_index_ghost(int M, int bin_size_x, i } } -template +template __global__ void spread_3d_block_gather(const T *x, const T *y, const T *z, const cuda_complex *c, cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, const int *binstartpts, int obin_size_x, int obin_size_y, - int obin_size_z, int binsperobin, const int *subprob_to_bin, + int obin_size_z, int binsperobin, int *subprob_to_bin, const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, int nobinz, const int *idxnupts, int pirange) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; - int xstart, ystart, zstart, xend, yend, zend; - int subpidx = blockIdx.x; - int obidx = subprob_to_bin[subpidx]; - int bidx = obidx * binsperobin; - - int obinsubp_idx = subpidx - subprobstartpts[obidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + obinsubp_idx * maxsubprobsize; - int nupts = - min(maxsubprobsize, binstartpts[bidx + binsperobin] - binstartpts[bidx] - obinsubp_idx * maxsubprobsize); - - int xoffset = (obidx % nobinx) * obin_size_x; - int yoffset = (obidx / nobinx) % nobiny * obin_size_y; - int zoffset = (obidx / (nobinx * nobiny)) * obin_size_z; - - int N = obin_size_x * obin_size_y * obin_size_z; - - for (int i = threadIdx.x; i < N; i += blockDim.x) { - fwshared[i].x = 0.0; - fwshared[i].y = 0.0; - } - __syncthreads(); - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - int b = idxnupts[idx] / M; - int box[3]; - for (int d = 0; d < 3; d++) { - box[d] = b % 3; - if (box[d] == 1) - box[d] = -1; - if (box[d] == 2) - box[d] = 1; - b = b / 3; - } - int ii = idxnupts[idx] % M; - x_rescaled = RESCALE(x[ii], nf1, pirange) + box[0] * nf1; - y_rescaled = RESCALE(y[ii], nf2, pirange) + box[1] * nf2; - z_rescaled = RESCALE(z[ii], nf3, pirange) + box[2] * nf3; - cnow = c[ii]; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - xstart = xstart < 0 ? 0 : xstart; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - ystart = ystart < 0 ? 0 : ystart; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - zstart = zstart < 0 ? 0 : zstart; - xend = floor(x_rescaled + ns / 2.0) - xoffset; - xend = xend >= obin_size_x ? obin_size_x - 1 : xend; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - yend = yend >= obin_size_y ? 
obin_size_y - 1 : yend; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - zend = zend >= obin_size_z ? obin_size_z - 1 : zend; - - for (int zz = zstart; zz <= zend; zz++) { - T disz = abs(z_rescaled - (zz + zoffset)); - T kervalue3 = evaluate_kernel(disz, es_c, es_beta, ns); - for (int yy = ystart; yy <= yend; yy++) { - T disy = abs(y_rescaled - (yy + yoffset)); - T kervalue2 = evaluate_kernel(disy, es_c, es_beta, ns); - for (int xx = xstart; xx <= xend; xx++) { - outidx = xx + yy * obin_size_x + zz * obin_size_y * obin_size_x; - T disx = abs(x_rescaled - (xx + xoffset)); - T kervalue1 = evaluate_kernel(disx, es_c, es_beta, ns); - atomicAdd(&fwshared[outidx].x, cnow.x * kervalue1 * kervalue2 * kervalue3); - atomicAdd(&fwshared[outidx].y, cnow.y * kervalue1 * kervalue2 * kervalue3); - } - } - } - } - __syncthreads(); - /* write to global memory */ - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % obin_size_x; - int j = (n / obin_size_x) % obin_size_y; - int k = n / (obin_size_x * obin_size_y); - - ix = xoffset + i; - iy = yoffset + j; - iz = zoffset + k; - outidx = ix + iy * nf1 + iz * nf1 * nf2; - atomicAdd(&fw[outidx].x, fwshared[n].x); - atomicAdd(&fw[outidx].y, fwshared[n].y); - } -} - -template -__global__ void spread_3d_block_gather_horner(const T *x, const T *y, const T *z, const cuda_complex *c, - cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, T sigma, const int *binstartpts, int obin_size_x, - int obin_size_y, int obin_size_z, int binsperobin, int *subprob_to_bin, - const int *subprobstartpts, int maxsubprobsize, int nobinx, int nobiny, - int nobinz, const int *idxnupts, int pirange) { - extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - int xstart, ystart, zstart, xend, yend, zend; int xstartnew, ystartnew, zstartnew, xendnew, yendnew, zendnew; int subpidx = blockIdx.x; @@ -600,9 +353,15 @@ __global__ void spread_3d_block_gather_horner(const T *x, const T *y, const T *z yend = floor(y_rescaled + ns / 2.0) - yoffset; zend = floor(z_rescaled + ns / 2.0) - zoffset; - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); + } xstartnew = xstart < 0 ? 0 : xstart; ystartnew = ystart < 0 ? 
0 : ystart; @@ -642,61 +401,10 @@ __global__ void spread_3d_block_gather_horner(const T *x, const T *y, const T *z /* ---------------------- 3d Interpolation Kernels ---------------------------*/ /* Kernels for NUptsdriven Method */ -template +template __global__ void interp_3d_nupts_driven(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T es_c, - T es_beta, const int *idxnupts, int pirange) { - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - - T ker1val, ker2val, ker3val; - for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { - T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); - T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); - T z_rescaled = RESCALE(z[idxnupts[i]], nf3, pirange); - int xstart = ceil(x_rescaled - ns / 2.0); - int ystart = ceil(y_rescaled - ns / 2.0); - int zstart = ceil(z_rescaled - ns / 2.0); - int xend = floor(x_rescaled + ns / 2.0); - int yend = floor(y_rescaled + ns / 2.0); - int zend = floor(z_rescaled + ns / 2.0); - cuda_complex cnow; - cnow.x = 0.0; - cnow.y = 0.0; - T x1 = (T)xstart - x_rescaled; - T y1 = (T)ystart - y_rescaled; - T z1 = (T)zstart - z_rescaled; - - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); - for (int zz = zstart; zz <= zend; zz++) { - ker3val = ker3[zz - zstart]; - for (int yy = ystart; yy <= yend; yy++) { - ker2val = ker2[yy - ystart]; - for (int xx = xstart; xx <= xend; xx++) { - int ix = xx < 0 ? xx + nf1 : (xx > nf1 - 1 ? xx - nf1 : xx); - int iy = yy < 0 ? yy + nf2 : (yy > nf2 - 1 ? yy - nf2 : yy); - int iz = zz < 0 ? zz + nf3 : (zz > nf3 - 1 ? zz - nf3 : zz); - - int inidx = ix + iy * nf1 + iz * nf2 * nf1; - - ker1val = ker1[xx - xstart]; - cnow.x += fw[inidx].x * ker1val * ker2val * ker3val; - cnow.y += fw[inidx].y * ker1val * ker2val * ker3val; - } - } - } - c[idxnupts[i]].x = cnow.x; - c[idxnupts[i]].y = cnow.y; - } -} - -template -__global__ void interp_3d_nupts_driven_horner(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, - T sigma, int *idxnupts, int pirange) { + T es_beta, T sigma, int *idxnupts, int pirange) { for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < M; i += blockDim.x * gridDim.x) { T x_rescaled = RESCALE(x[idxnupts[i]], nf1, pirange); T y_rescaled = RESCALE(y[idxnupts[i]], nf2, pirange); @@ -718,9 +426,15 @@ __global__ void interp_3d_nupts_driven_horner(const T *x, const T *y, const T *z T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; - eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma); + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart - x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart - z_rescaled, ns, es_c, es_beta); + } for (int zz = zstart; zz <= zend; zz++) { T kervalue3 = ker3[zz - zstart]; @@ -743,13 +457,13 @@ __global__ void interp_3d_nupts_driven_horner(const T *x, const T *y, const T *z } /* Kernels for SubProb Method */ -template +template __global__ void 
interp_3d_subprob(const T *x, const T *y, const T *z, cuda_complex *c, const cuda_complex *fw, - int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, const int *binstartpts, - const int *bin_size, int bin_size_x, int bin_size_y, int bin_size_z, - const int *subprob_to_bin, const int *subprobstartpts, const int *numsubprob, - int maxsubprobsize, int nbinx, int nbiny, int nbinz, const int *idxnupts, - int pirange) { + int M, int ns, int nf1, int nf2, int nf3, T es_c, T es_beta, T sigma, + const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, + int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, + const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, + const int *idxnupts, int pirange) { extern __shared__ char sharedbuf[]; cuda_complex *fwshared = (cuda_complex *)sharedbuf; @@ -768,7 +482,6 @@ __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_compl int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); -#if 1 for (int n = threadIdx.x; n < N; n += blockDim.x) { int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); @@ -788,12 +501,10 @@ __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_compl fwshared[sharedidx].y = fw[outidx].y; } } -#endif __syncthreads(); T ker1[MAX_NSPREAD]; T ker2[MAX_NSPREAD]; T ker3[MAX_NSPREAD]; - T x_rescaled, y_rescaled, z_rescaled; cuda_complex cnow; for (int i = threadIdx.x; i < nupts; i += blockDim.x) { @@ -803,6 +514,7 @@ __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_compl z_rescaled = RESCALE(z[idxnupts[idx]], nf3, pirange); cnow.x = 0.0; cnow.y = 0.0; + xstart = ceil(x_rescaled - ns / 2.0) - xoffset; ystart = ceil(y_rescaled - ns / 2.0) - yoffset; zstart = ceil(z_rescaled - ns / 2.0) - zoffset; @@ -811,105 +523,16 @@ __global__ void interp_3d_subprob(const T *x, const T *y, const T *z, cuda_compl yend = floor(y_rescaled + ns / 2.0) - yoffset; zend = floor(z_rescaled + ns / 2.0) - zoffset; - T x1 = (T)xstart + xoffset - x_rescaled; - T y1 = (T)ystart + yoffset - y_rescaled; - T z1 = (T)zstart + zoffset - z_rescaled; - - eval_kernel_vec(ker1, x1, ns, es_c, es_beta); - eval_kernel_vec(ker2, y1, ns, es_c, es_beta); - eval_kernel_vec(ker3, z1, ns, es_c, es_beta); - - for (int zz = zstart; zz <= zend; zz++) { - T kervalue3 = ker3[zz - zstart]; - iz = zz + ceil(ns / 2.0); - for (int yy = ystart; yy <= yend; yy++) { - T kervalue2 = ker2[yy - ystart]; - iy = yy + ceil(ns / 2.0); - for (int xx = xstart; xx <= xend; xx++) { - ix = xx + ceil(ns / 2.0); - outidx = ix + iy * (bin_size_x + ceil(ns / 2.0) * 2) + - iz * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - - T kervalue1 = ker1[xx - xstart]; - cnow.x += fwshared[outidx].x * kervalue1 * kervalue2 * kervalue3; - cnow.y += fwshared[outidx].y * kervalue1 * kervalue2 * kervalue3; - } - } - } - c[idxnupts[idx]].x = cnow.x; - c[idxnupts[idx]].y = cnow.y; - } -} - -template -__global__ void interp_3d_subprob_horner(const T *x, const T *y, const T *z, cuda_complex *c, - const cuda_complex *fw, int M, int ns, int nf1, int nf2, int nf3, T sigma, - const int *binstartpts, const int *bin_size, int bin_size_x, int bin_size_y, - int bin_size_z, const int *subprob_to_bin, const int *subprobstartpts, - const int *numsubprob, int maxsubprobsize, int nbinx, int nbiny, int nbinz, - const int *idxnupts, int pirange) { 
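
/* The pattern at work in this hunk, and throughout the diff: each *_horner
   kernel is deleted and its non-Horner twin gains an int KEREVALMETH template
   parameter, dispatched with C++17 `if constexpr` so that only the selected
   evaluator is compiled into each instantiation. A minimal sketch of the
   idiom (illustrative only; eval_kernel_vec_dispatch is a made-up name, and
   eval_kernel_vec / eval_kernel_vec_horner are assumed as declared elsewhere
   in these headers):

   template <typename T, int KEREVALMETH>
   __device__ void eval_kernel_vec_dispatch(T *ker, T x1, int ns, T es_c, T es_beta, T sigma) {
       if constexpr (KEREVALMETH == 1)
           eval_kernel_vec_horner(ker, x1, ns, sigma);   // piecewise-polynomial (Horner) evaluation
       else
           eval_kernel_vec(ker, x1, ns, es_c, es_beta);  // direct exp(sqrt(...)) evaluation
   }

   The host wrappers pick the instantiation at launch time, e.g.
   interp_3d_subprob<T, 1> when opts.gpu_kerevalmeth == 1 and
   interp_3d_subprob<T, 0> otherwise, so the runtime flag costs nothing
   inside the kernel. */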
- extern __shared__ char sharedbuf[]; - cuda_complex *fwshared = (cuda_complex *)sharedbuf; - - int xstart, ystart, xend, yend, zstart, zend; - int subpidx = blockIdx.x; - int bidx = subprob_to_bin[subpidx]; - int binsubp_idx = subpidx - subprobstartpts[bidx]; - int ix, iy, iz; - int outidx; - int ptstart = binstartpts[bidx] + binsubp_idx * maxsubprobsize; - int nupts = min(maxsubprobsize, bin_size[bidx] - binsubp_idx * maxsubprobsize); - - int xoffset = (bidx % nbinx) * bin_size_x; - int yoffset = ((bidx / nbinx) % nbiny) * bin_size_y; - int zoffset = (bidx / (nbinx * nbiny)) * bin_size_z; - - int N = (bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0)) * (bin_size_z + 2 * ceil(ns / 2.0)); - - for (int n = threadIdx.x; n < N; n += blockDim.x) { - int i = n % (int)(bin_size_x + 2 * ceil(ns / 2.0)); - int j = (int)(n / (bin_size_x + 2 * ceil(ns / 2.0))) % (int)(bin_size_y + 2 * ceil(ns / 2.0)); - int k = n / ((bin_size_x + 2 * ceil(ns / 2.0)) * (bin_size_y + 2 * ceil(ns / 2.0))); - - ix = xoffset - ceil(ns / 2.0) + i; - iy = yoffset - ceil(ns / 2.0) + j; - iz = zoffset - ceil(ns / 2.0) + k; - if (ix < (nf1 + ceil(ns / 2.0)) && iy < (nf2 + ceil(ns / 2.0)) && iz < (nf3 + ceil(ns / 2.0))) { - ix = ix < 0 ? ix + nf1 : (ix > nf1 - 1 ? ix - nf1 : ix); - iy = iy < 0 ? iy + nf2 : (iy > nf2 - 1 ? iy - nf2 : iy); - iz = iz < 0 ? iz + nf3 : (iz > nf3 - 1 ? iz - nf3 : iz); - outidx = ix + iy * nf1 + iz * nf1 * nf2; - int sharedidx = i + j * (bin_size_x + ceil(ns / 2.0) * 2) + - k * (bin_size_x + ceil(ns / 2.0) * 2) * (bin_size_y + ceil(ns / 2.0) * 2); - fwshared[sharedidx].x = fw[outidx].x; - fwshared[sharedidx].y = fw[outidx].y; + if constexpr (KEREVALMETH == 1) { + eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); + eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); + eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); + } else { + eval_kernel_vec(ker1, xstart + xoffset - x_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker2, ystart + yoffset - y_rescaled, ns, es_c, es_beta); + eval_kernel_vec(ker3, zstart + zoffset - z_rescaled, ns, es_c, es_beta); } - } - __syncthreads(); - T ker1[MAX_NSPREAD]; - T ker2[MAX_NSPREAD]; - T ker3[MAX_NSPREAD]; - T x_rescaled, y_rescaled, z_rescaled; - cuda_complex cnow; - for (int i = threadIdx.x; i < nupts; i += blockDim.x) { - int idx = ptstart + i; - x_rescaled = RESCALE(x[idxnupts[idx]], nf1, pirange); - y_rescaled = RESCALE(y[idxnupts[idx]], nf2, pirange); - z_rescaled = RESCALE(z[idxnupts[idx]], nf3, pirange); - cnow.x = 0.0; - cnow.y = 0.0; - - xstart = ceil(x_rescaled - ns / 2.0) - xoffset; - ystart = ceil(y_rescaled - ns / 2.0) - yoffset; - zstart = ceil(z_rescaled - ns / 2.0) - zoffset; - - xend = floor(x_rescaled + ns / 2.0) - xoffset; - yend = floor(y_rescaled + ns / 2.0) - yoffset; - zend = floor(z_rescaled + ns / 2.0) - zoffset; - eval_kernel_vec_horner(ker1, xstart + xoffset - x_rescaled, ns, sigma); - eval_kernel_vec_horner(ker2, ystart + yoffset - y_rescaled, ns, sigma); - eval_kernel_vec_horner(ker3, zstart + zoffset - z_rescaled, ns, sigma); for (int zz = zstart; zz <= zend; zz++) { T kervalue3 = ker3[zz - zstart]; iz = zz + ceil(ns / 2.0); diff --git a/src/cuda/CMakeLists.txt b/src/cuda/CMakeLists.txt index e5e1d8d49..7b2479ddd 100644 --- a/src/cuda/CMakeLists.txt +++ b/src/cuda/CMakeLists.txt @@ -16,7 +16,6 @@ set(PRECISION_DEPENDENT_SRC set(CUFINUFFT_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/contrib - 
${PROJECT_SOURCE_DIR}/include/cufinufft/contrib/cuda_samples $ $ $ diff --git a/src/cuda/common.cu b/src/cuda/common.cu index 11ed72ab8..30b8acef8 100644 --- a/src/cuda/common.cu +++ b/src/cuda/common.cu @@ -1,12 +1,14 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -70,6 +72,8 @@ int cufserieskernelcompute(int dim, int nf1, int nf2, int nf3, T *d_f, cuDoubleC fseries_kernel_compute<<>>(nf1, nf2, nf3, d_f, d_a, d_fwkerhalf1, d_fwkerhalf2, d_fwkerhalf3, ns); + RETURN_IF_CUDA_ERROR + return 0; } diff --git a/src/cuda/cufinufft.cu b/src/cuda/cufinufft.cu index 87482a6a2..d14a593ff 100644 --- a/src/cuda/cufinufft.cu +++ b/src/cuda/cufinufft.cu @@ -6,11 +6,13 @@ #include #include -inline bool is_invalid_mode_array(int dim, int64_t *modes64, int32_t modes32[3]) { +inline bool is_invalid_mode_array(int dim, const int64_t *modes64, int32_t modes32[3]) { int64_t tot_size = 1; for (int i = 0; i < dim; ++i) { if (modes64[i] > std::numeric_limits::max()) return true; + if (modes64[i] <= 0) + return true; modes32[i] = modes64[i]; tot_size *= modes64[i]; } @@ -21,21 +23,31 @@ inline bool is_invalid_mode_array(int dim, int64_t *modes64, int32_t modes32[3]) } extern "C" { -int cufinufftf_makeplan(int type, int dim, int64_t *nmodes, int iflag, int ntransf, float tol, +int cufinufftf_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, float tol, cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + int nmodes32[3]; if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return ERR_NDATA_NOTVALID; + return FINUFFT_ERR_NDATA_NOTVALID; return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, opts); } -int cufinufft_makeplan(int type, int dim, int64_t *nmodes, int iflag, int ntransf, double tol, +int cufinufft_makeplan(int type, int dim, const int64_t *nmodes, int iflag, int ntransf, double tol, cufinufft_plan *d_plan_ptr, cufinufft_opts *opts) { + if (dim < 1 || dim > 3) { + fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); + return FINUFFT_ERR_DIM_NOTVALID; + } + int nmodes32[3]; if (is_invalid_mode_array(dim, nmodes, nmodes32)) - return ERR_NDATA_NOTVALID; + return FINUFFT_ERR_NDATA_NOTVALID; return cufinufft_makeplan_impl(type, dim, nmodes32, iflag, ntransf, tol, (cufinufft_plan_t **)d_plan_ptr, opts); @@ -79,7 +91,7 @@ int cufinufft_destroy(cufinufft_plan d_plan) { return cufinufft_destroy_impl((cufinufft_plan_t *)d_plan); } -int cufinufft_default_opts(int type, int dim, cufinufft_opts *opts) +void cufinufft_default_opts(cufinufft_opts *opts) /* Sets the default options in cufinufft_opts. This must be called before the user changes any options from default values. @@ -96,7 +108,6 @@ int cufinufft_default_opts(int type, int dim, cufinufft_opts *opts) Melody Shih 07/25/19; Barnett 2/5/21. 
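Aside on the `cufinufft.cu` hunk above: `is_invalid_mode_array` now rejects non-positive mode counts in addition to values that overflow 32 bits, and both `makeplan` wrappers gain an explicit `dim` range check returning `FINUFFT_ERR_DIM_NOTVALID`. A self-contained sketch of that narrowing validation (names illustrative; the real routine accumulates `tot_size` the same way):

```cpp
#include <cstdint>
#include <limits>

// Validate 64-bit user mode counts before narrowing to the 32-bit ints that
// cuFFT can handle. Returns true when the modes are usable.
bool modes_fit_int32(int dim, const int64_t *modes64, int32_t modes32[3]) {
    int64_t tot_size = 1;
    for (int i = 0; i < dim; ++i) {
        if (modes64[i] <= 0) return false;  // new check: empty/negative dims
        if (modes64[i] > std::numeric_limits<int32_t>::max()) return false;
        modes32[i] = static_cast<int32_t>(modes64[i]);
        tot_size *= modes64[i];
        // checking the running product keeps the int64 accumulator itself
        // from overflowing for dim <= 3
        if (tot_size > std::numeric_limits<int32_t>::max()) return false;
    }
    return true;
}
```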
*/ { - int ier; opts->upsampfac = 2.0; /* following options are for gpu */ @@ -116,54 +127,11 @@ int cufinufft_default_opts(int type, int dim, cufinufft_opts *opts) opts->gpu_maxbatchsize = 0; // Heuristically set - switch (dim) { - case 1: { - opts->gpu_kerevalmeth = 0; // using exp(sqrt()) - if (type == 1) { - opts->gpu_method = 2; - } - if (type == 2) { - opts->gpu_method = 1; - } - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = 1; - return ier; - } - } break; - case 2: { - opts->gpu_kerevalmeth = 0; // using exp(sqrt()) - if (type == 1) { - opts->gpu_method = 2; - } - if (type == 2) { - opts->gpu_method = 1; - } - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = 1; - return ier; - } - } break; - case 3: { - opts->gpu_kerevalmeth = 0; // using exp(sqrt()) - if (type == 1) { - opts->gpu_method = 2; - } - if (type == 2) { - opts->gpu_method = 1; - } - if (type == 3) { - std::cerr << "Not Implemented yet" << std::endl; - ier = 1; - return ier; - } - } break; - } + opts->gpu_kerevalmeth = 1; // Horner + + opts->gpu_method = 0; // Auto method (2 for type 1, 2 for type 2). // By default, only use device 0 opts->gpu_device_id = 0; - - return 0; } } diff --git a/src/cuda/deconvolve_wrapper.cu b/src/cuda/deconvolve_wrapper.cu index 782d226c8..170c2bb0e 100644 --- a/src/cuda/deconvolve_wrapper.cu +++ b/src/cuda/deconvolve_wrapper.cu @@ -3,7 +3,7 @@ #include #include -#include +#include #include diff --git a/src/cuda/memtransfer_wrapper.cu b/src/cuda/memtransfer_wrapper.cu index 5afb2b538..4216af0db 100644 --- a/src/cuda/memtransfer_wrapper.cu +++ b/src/cuda/memtransfer_wrapper.cu @@ -1,10 +1,12 @@ -#include #include #include #include #include -#include +#include +#include + +#include namespace cufinufft { namespace memtransfer { @@ -17,42 +19,49 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; int nf1 = d_plan->nf1; int maxbatchsize = d_plan->maxbatchsize; - d_plan->byte_now = 0; switch (d_plan->opts.gpu_method) { case 1: { if (d_plan->opts.gpu_sort) { int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins * sizeof(int))))) + goto finalize; } } break; case 2: { int numbins = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numbins * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numbins + 1) * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numbins * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numbins + 1) * sizeof(int))))) + goto finalize; } break; default: std::cerr << 
"err: invalid method " << std::endl; } if (!d_plan->opts.gpu_spreadinterponly) { - checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * sizeof(cuda_complex))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))))) + goto finalize; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - return 0; +finalize: + if (ier) + freegpumemory(d_plan); + + return ier; } template @@ -63,84 +72,36 @@ int allocgpumem1d_nupts(cufinufft_plan_t *d_plan) Melody Shih 11/21/21 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; int M = d_plan->M; - - if (d_plan->sortidx) { - checkCudaErrors(cudaFree(d_plan->sortidx)); - d_plan->sortidx = nullptr; - } - if (d_plan->idxnupts) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - d_plan->idxnupts = nullptr; - } + CUDA_FREE_AND_NULL(d_plan->sortidx); + CUDA_FREE_AND_NULL(d_plan->idxnupts); switch (d_plan->opts.gpu_method) { case 1: { - if (d_plan->opts.gpu_sort) - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; } break; - case 2: - case 3: { - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); + case 2: { + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; } break; default: - std::cerr << "err: invalid method" << std::endl; + std::cerr << "[allocgpumem1d_nupts] error: invalid method\n"; + ier = FINUFFT_ERR_METHOD_NOTVALID; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - - return 0; -} - -template -void freegpumemory1d(cufinufft_plan_t *d_plan) -/* - wrapper for freeing gpu memory. 
- - Melody Shih 11/21/21 -*/ -{ - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); - - if (!d_plan->opts.gpu_spreadinterponly) { - checkCudaErrors(cudaFree(d_plan->fw)); - checkCudaErrors(cudaFree(d_plan->fwkerhalf1)); - } - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - } else { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - } - } break; - case 2: { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->numsubprob)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - checkCudaErrors(cudaFree(d_plan->subprobstartpts)); - checkCudaErrors(cudaFree(d_plan->subprob_to_bin)); - } break; - } +finalize: + if (ier) + freegpumemory(d_plan); - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); + return ier; } template @@ -151,53 +112,56 @@ int allocgpumem2d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int maxbatchsize = d_plan->maxbatchsize; - d_plan->byte_now = 0; switch (d_plan->opts.gpu_method) { case 1: { if (d_plan->opts.gpu_sort) { int numbins[2]; numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int))))) + goto finalize; } } break; case 2: { - int numbins[2]; + int64_t numbins[2]; numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numbins[0] * numbins[1] + 1) * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numbins[0] * numbins[1] * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numbins[0] * numbins[1] + 1) * sizeof(int))))) + goto finalize; } break; default: - std::cerr << "err: invalid method " << std::endl; + std::cerr << "[allocgpumem2d_plan] error: invalid method\n"; } if (!d_plan->opts.gpu_spreadinterponly) { - checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * nf2 * 
sizeof(cuda_complex))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * nf2 * sizeof(cuda_complex))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T))))) + goto finalize; } - cudaStream_t *streams = (cudaStream_t *)malloc(d_plan->opts.gpu_nstreams * sizeof(cudaStream_t)); - for (int i = 0; i < d_plan->opts.gpu_nstreams; i++) - checkCudaErrors(cudaStreamCreate(&streams[i])); - d_plan->streams = streams; +finalize: + if (ier) + freegpumemory(d_plan); - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); - return 0; + return ier; } template @@ -208,87 +172,36 @@ int allocgpumem2d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; - int M = d_plan->M; + const int M = d_plan->M; - if (d_plan->sortidx) { - checkCudaErrors(cudaFree(d_plan->sortidx)); - d_plan->sortidx = nullptr; - } - if (d_plan->idxnupts) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - d_plan->idxnupts = nullptr; - } + CUDA_FREE_AND_NULL(d_plan->sortidx); + CUDA_FREE_AND_NULL(d_plan->idxnupts); switch (d_plan->opts.gpu_method) { case 1: { - if (d_plan->opts.gpu_sort) - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); + if (d_plan->opts.gpu_sort && (ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; } break; case 2: { - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; } break; default: - std::cerr << "err: invalid method" << std::endl; + std::cerr << "[allocgpumem2d_nupts] error: invalid method\n"; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); +finalize: + if (ier) + freegpumemory(d_plan); - return 0; -} - -template -void freegpumemory2d(cufinufft_plan_t *d_plan) -/* - wrapper for freeing gpu memory. 
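`CUDA_FREE_AND_NULL`, used throughout these hunks, is not defined anywhere in this patch; presumably it wraps the free-then-null dance the old code spelled out by hand, along the lines of:

```cpp
// Assumed shape of the macro (the real definition lives in a cufinufft
// header that is not part of this diff).
#define CUDA_FREE_AND_NULL(p) \
    do {                      \
        cudaFree(p);          \
        (p) = nullptr;        \
    } while (0)
```

The `do { } while (0)` wrapper makes the macro behave as a single statement after an unbraced `if`, and since `cudaFree(nullptr)` is a no-op, invoking it on an already-released member is harmless.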
- - Melody Shih 07/25/19 -*/ -{ - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); - - if (!d_plan->opts.gpu_spreadinterponly) { - checkCudaErrors(cudaFree(d_plan->fw)); - checkCudaErrors(cudaFree(d_plan->fwkerhalf1)); - checkCudaErrors(cudaFree(d_plan->fwkerhalf2)); - } - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - } else { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - } - } break; - case 2: { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->numsubprob)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - checkCudaErrors(cudaFree(d_plan->subprobstartpts)); - checkCudaErrors(cudaFree(d_plan->subprob_to_bin)); - } break; - } - - for (int i = 0; i < d_plan->opts.gpu_nstreams; i++) - checkCudaErrors(cudaStreamDestroy(d_plan->streams[i])); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); + return ier; } template @@ -299,75 +212,83 @@ int allocgpumem3d_plan(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; int nf1 = d_plan->nf1; int nf2 = d_plan->nf2; int nf3 = d_plan->nf3; int maxbatchsize = d_plan->maxbatchsize; - d_plan->byte_now = 0; - switch (d_plan->opts.gpu_method) { case 1: { if (d_plan->opts.gpu_sort) { - int numbins[3]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - numbins[2] = ceil((T)nf3 / d_plan->opts.gpu_binsizez); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * + ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, nbins_tot * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, nbins_tot * sizeof(int))))) + goto finalize; } } break; case 2: { - int numbins[3]; - numbins[0] = ceil((T)nf1 / d_plan->opts.gpu_binsizex); - numbins[1] = ceil((T)nf2 / d_plan->opts.gpu_binsizey); - numbins[2] = ceil((T)nf3 / d_plan->opts.gpu_binsizez); - checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numbins[0] * numbins[1] * numbins[2] + 1) * sizeof(int))); + const int64_t nbins_tot = ceil((T)nf1 / d_plan->opts.gpu_binsizex) * ceil((T)nf2 / d_plan->opts.gpu_binsizey) * + ceil((T)nf3 / d_plan->opts.gpu_binsizez); + + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->numsubprob, nbins_tot * sizeof(int))))) + goto finalize; + if ((ier = 
checkCudaErrors(cudaMalloc(&d_plan->binsize, nbins_tot * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, nbins_tot * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (nbins_tot + 1) * sizeof(int))))) + goto finalize; } break; case 4: { - int numobins[3], numbins[3]; - int binsperobins[3]; - numobins[0] = ceil((T)nf1 / d_plan->opts.gpu_obinsizex); - numobins[1] = ceil((T)nf2 / d_plan->opts.gpu_obinsizey); - numobins[2] = ceil((T)nf3 / d_plan->opts.gpu_obinsizez); - - binsperobins[0] = d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex; - binsperobins[1] = d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey; - binsperobins[2] = d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez; - - numbins[0] = numobins[0] * (binsperobins[0] + 2); - numbins[1] = numobins[1] * (binsperobins[1] + 2); - numbins[2] = numobins[2] * (binsperobins[2] + 2); - - checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numobins[0] * numobins[1] * numobins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins[0] * numbins[1] * numbins[2] * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->binstartpts, (numbins[0] * numbins[1] * numbins[2] + 1) * sizeof(int))); - checkCudaErrors( - cudaMalloc(&d_plan->subprobstartpts, (numobins[0] * numobins[1] * numobins[2] + 1) * sizeof(int))); + const int numobins[3] = {(int)ceil((T)nf1 / d_plan->opts.gpu_obinsizex), + (int)ceil((T)nf2 / d_plan->opts.gpu_obinsizey), + (int)ceil((T)nf3 / d_plan->opts.gpu_obinsizez)}; + + const int binsperobins[3] = {d_plan->opts.gpu_obinsizex / d_plan->opts.gpu_binsizex, + d_plan->opts.gpu_obinsizey / d_plan->opts.gpu_binsizey, + d_plan->opts.gpu_obinsizez / d_plan->opts.gpu_binsizez}; + + const int numbins[3] = {numobins[0] * (binsperobins[0] + 2), numobins[1] * (binsperobins[1] + 2), + numobins[2] * (binsperobins[2] + 2)}; + + const int64_t numobins_tot = numobins[0] * numobins[1] * numobins[2]; + const int64_t numbins_tot = numbins[0] * numbins[1] * numbins[2]; + + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->numsubprob, numobins_tot * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binsize, numbins_tot * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->binstartpts, (numbins_tot + 1) * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->subprobstartpts, (numobins_tot + 1) * sizeof(int))))) + goto finalize; } break; default: - std::cerr << "err: invalid method" << std::endl; + std::cerr << "[allocgpumem3d_plan] error: invalid method\n"; } if (!d_plan->opts.gpu_spreadinterponly) { - checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T))); - checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fw, maxbatchsize * nf1 * nf2 * nf3 * sizeof(cuda_complex))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf1, (nf1 / 2 + 1) * sizeof(T))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf2, (nf2 / 2 + 1) * sizeof(T))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->fwkerhalf3, (nf3 / 2 + 1) * sizeof(T))))) + goto finalize; } - // Multi-GPU support: reset the device ID - 
cudaSetDevice(orig_gpu_device_id); +finalize: + if (ier) + freegpumemory(d_plan); - return 0; + return ier; } template @@ -378,125 +299,86 @@ int allocgpumem3d_nupts(cufinufft_plan_t *d_plan) Melody Shih 07/25/19 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); - + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + int ier; int M = d_plan->M; - d_plan->byte_now = 0; - - if (d_plan->sortidx) { - checkCudaErrors(cudaFree(d_plan->sortidx)); - d_plan->sortidx = nullptr; - } - if (d_plan->idxnupts) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - d_plan->idxnupts = nullptr; - } + CUDA_FREE_AND_NULL(d_plan->sortidx); + CUDA_FREE_AND_NULL(d_plan->idxnupts) switch (d_plan->opts.gpu_method) { case 1: { - if (d_plan->opts.gpu_sort) - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); + if (d_plan->opts.gpu_sort && ((ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int)))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; } break; case 2: { - checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->idxnupts, M * sizeof(int))))) + goto finalize; + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; } break; case 4: { - checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))); + if ((ier = checkCudaErrors(cudaMalloc(&d_plan->sortidx, M * sizeof(int))))) + goto finalize; } break; default: - std::cerr << "err: invalid method" << std::endl; + std::cerr << "[allocgpumem3d_nupts] error: invalid method\n"; } - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); +finalize: + if (ier) + freegpumemory(d_plan); - return 0; + return ier; } template -void freegpumemory3d(cufinufft_plan_t *d_plan) +void freegpumemory(cufinufft_plan_t *d_plan) /* wrapper for freeing gpu memory. 
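Every hunk in this file also swaps the manual save/set/restore of the CUDA device ID for a scoped `utils::WithCudaDevice` guard. Its definition is not shown in the patch; a minimal RAII sketch consistent with the code it replaces would be:

```cpp
#include <cuda_runtime.h>

// Minimal RAII device guard (assumed shape of utils::WithCudaDevice).
// The destructor restores the caller's device on every exit path, which is
// what keeps the goto-finalize error returns above from leaking the device
// switch -- the old paired cudaSetDevice calls could not guarantee that.
class WithCudaDevice {
  public:
    explicit WithCudaDevice(int device) {
        cudaGetDevice(&orig_device_);  // remember the caller's device
        cudaSetDevice(device);
    }
    ~WithCudaDevice() { cudaSetDevice(orig_device_); }
    WithCudaDevice(const WithCudaDevice &) = delete;
    WithCudaDevice &operator=(const WithCudaDevice &) = delete;

  private:
    int orig_device_ = 0;
};
```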
- Melody Shih 07/25/19 + Melody Shih 11/21/21 */ { - // Mult-GPU support: set the CUDA Device ID: - int orig_gpu_device_id; - cudaGetDevice(&orig_gpu_device_id); - cudaSetDevice(d_plan->opts.gpu_device_id); - - if (!d_plan->opts.gpu_spreadinterponly) { - cudaFree(d_plan->fw); - cudaFree(d_plan->fwkerhalf1); - cudaFree(d_plan->fwkerhalf2); - cudaFree(d_plan->fwkerhalf3); - } - - switch (d_plan->opts.gpu_method) { - case 1: { - if (d_plan->opts.gpu_sort) { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - } else { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - } - } break; - case 2: { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->numsubprob)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - checkCudaErrors(cudaFree(d_plan->subprobstartpts)); - checkCudaErrors(cudaFree(d_plan->subprob_to_bin)); - } break; - case 4: { - checkCudaErrors(cudaFree(d_plan->idxnupts)); - checkCudaErrors(cudaFree(d_plan->sortidx)); - checkCudaErrors(cudaFree(d_plan->numsubprob)); - checkCudaErrors(cudaFree(d_plan->binsize)); - checkCudaErrors(cudaFree(d_plan->binstartpts)); - checkCudaErrors(cudaFree(d_plan->subprobstartpts)); - checkCudaErrors(cudaFree(d_plan->subprob_to_bin)); - } break; - } - - for (int i = 0; i < d_plan->opts.gpu_nstreams; i++) - checkCudaErrors(cudaStreamDestroy(d_plan->streams[i])); - - // Multi-GPU support: reset the device ID - cudaSetDevice(orig_gpu_device_id); + utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id); + + CUDA_FREE_AND_NULL(d_plan->fw); + CUDA_FREE_AND_NULL(d_plan->fw); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf1); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf2); + CUDA_FREE_AND_NULL(d_plan->fwkerhalf3); + + CUDA_FREE_AND_NULL(d_plan->idxnupts); + CUDA_FREE_AND_NULL(d_plan->sortidx); + CUDA_FREE_AND_NULL(d_plan->numsubprob); + CUDA_FREE_AND_NULL(d_plan->binsize); + CUDA_FREE_AND_NULL(d_plan->binstartpts); + CUDA_FREE_AND_NULL(d_plan->subprob_to_bin); + CUDA_FREE_AND_NULL(d_plan->subprobstartpts); + + CUDA_FREE_AND_NULL(d_plan->numnupts); + CUDA_FREE_AND_NULL(d_plan->numsubprob); } template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); template int allocgpumem1d_plan(cufinufft_plan_t *d_plan); template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); template int allocgpumem1d_nupts(cufinufft_plan_t *d_plan); -template void freegpumemory1d(cufinufft_plan_t *d_plan); -template void freegpumemory1d(cufinufft_plan_t *d_plan); + +template void freegpumemory(cufinufft_plan_t *d_plan); +template void freegpumemory(cufinufft_plan_t *d_plan); template int allocgpumem2d_plan(cufinufft_plan_t *d_plan); template int allocgpumem2d_plan(cufinufft_plan_t *d_plan); template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); template int allocgpumem2d_nupts(cufinufft_plan_t *d_plan); -template void freegpumemory2d(cufinufft_plan_t *d_plan); -template void freegpumemory2d(cufinufft_plan_t *d_plan); template int allocgpumem3d_plan(cufinufft_plan_t *d_plan); template int allocgpumem3d_plan(cufinufft_plan_t *d_plan); template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); template int allocgpumem3d_nupts(cufinufft_plan_t *d_plan); -template void freegpumemory3d(cufinufft_plan_t *d_plan); -template void freegpumemory3d(cufinufft_plan_t *d_plan); } // namespace memtransfer } // namespace cufinufft diff --git 
a/src/cuda/spreadinterp.cpp b/src/cuda/spreadinterp.cpp index bedaea008..b8ab601b2 100644 --- a/src/cuda/spreadinterp.cpp +++ b/src/cuda/spreadinterp.cpp @@ -8,6 +8,8 @@ #include #include +#include + namespace cufinufft { namespace spreadinterp { @@ -22,18 +24,17 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet { if (upsampfac != 2.0) { // nonstandard sigma if (kerevalmeth == 1) { - fprintf(stderr, "setup_spreader: nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", - (double)upsampfac); - return HORNER_WRONG_BETA; + fprintf(stderr, "[%s] nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n", __func__, + upsampfac); + return FINUFFT_ERR_HORNER_WRONG_BETA; } if (upsampfac <= 1.0) { - fprintf(stderr, "setup_spreader: error, upsampfac=%.3g is <=1.0\n", (double)upsampfac); - return ERR_UPSAMPFAC_TOO_SMALL; + fprintf(stderr, "[%s] error: upsampfac=%.3g is <=1.0\n", __func__, upsampfac); + return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } // calling routine must abort on above errors, since opts is garbage! if (upsampfac > 4.0) - fprintf(stderr, "setup_spreader: warning, upsampfac=%.3g is too large to be beneficial!\n", - (double)upsampfac); + fprintf(stderr, "[%s] warning: upsampfac=%.3g is too large to be beneficial!\n", __func__, upsampfac); } // defaults... (user can change after this function called) @@ -49,7 +50,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet fprintf(stderr, "setup_spreader: warning, increasing tol=%.3g to eps_mach=%.3g.\n", (double)eps, (double)EPSILON); eps = EPSILON; - ier = WARN_EPS_TOO_SMALL; + ier = FINUFFT_WARN_EPS_TOO_SMALL; } // Set kernel width w (aka ns) and ES kernel beta parameter, in opts... @@ -61,7 +62,7 @@ int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmet fprintf(stderr, "%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n", __func__, upsampfac, (double)eps, ns, MAX_NSPREAD); ns = MAX_NSPREAD; - ier = WARN_EPS_TOO_SMALL; + ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; opts.ES_halfwidth = (T)ns / 2; // constants to help ker eval (except Horner) diff --git a/src/finufft.cpp b/src/finufft.cpp index 32923eb74..696500b76 100644 --- a/src/finufft.cpp +++ b/src/finufft.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,10 @@ Design notes for guru interface implementation: namespace finufft { namespace common { + // Technically global state... + // Needs to be static to avoid name collision with SINGLE/DOUBLE + static std::mutex fftw_lock; + // We macro because it has no FLT args but gets compiled for both prec's... 
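On the `static std::mutex fftw_lock` introduced just above: finufft.cpp is compiled twice, once per precision (the `SINGLE`/`DOUBLE` collision the comment refers to), so a namespace-scope mutex with external linkage would define the same symbol in two object files. `static` gives it internal linkage, i.e. one private lock per translation unit, which appears adequate here since the single- and double-precision builds drive FFTW's separate `fftwf_`/`fftw_` planner states (a sketch of the reasoning, not library code):

```cpp
#include <mutex>

namespace finufft {
namespace common {
// Internal linkage: the SINGLE and DOUBLE compilations of this file each get
// their own lock, so the two object files link together without a
// duplicate-symbol error.
static std::mutex fftw_lock;
} // namespace common
} // namespace finufft
```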
#ifdef SINGLE #define SET_NF_TYPE12 set_nf_type12f @@ -110,7 +115,7 @@ int SET_NF_TYPE12(BIGINT ms, finufft_opts opts, finufft_spread_opts spopts, BIGI return 0; } else { fprintf(stderr,"[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a malloc\n",__func__,(double)*nf,(double)MAX_NF); - return ERR_MAXNALLOC; + return FINUFFT_ERR_MAXNALLOC; } } @@ -544,7 +549,6 @@ void FINUFFT_DEFAULT_OPTS(finufft_opts *o) // sphinx tag (don't remove): @defopts_end } - // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, int ntrans, FLT tol, FINUFFT_PLAN *pp, finufft_opts* opts) @@ -570,15 +574,15 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, if((type!=1)&&(type!=2)&&(type!=3)) { fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n",__func__,type); - return ERR_TYPE_NOTVALID; + return FINUFFT_ERR_TYPE_NOTVALID; } if((dim!=1)&&(dim!=2)&&(dim!=3)) { fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n",__func__,dim); - return ERR_DIM_NOTVALID; + return FINUFFT_ERR_DIM_NOTVALID; } if (ntrans<1) { fprintf(stderr,"[%s] ntrans (%d) should be at least 1.\n",__func__,ntrans); - return ERR_NTRANS_NOTVALID; + return FINUFFT_ERR_NTRANS_NOTVALID; } // get stuff from args... @@ -606,7 +610,7 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, p->opts.spread_thread=2; // our auto choice if (p->opts.spread_thread!=1 && p->opts.spread_thread!=2) { fprintf(stderr,"[%s] illegal opts.spread_thread!\n",__func__); - return ERR_SPREAD_THREAD_NOTVALID; + return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; } if (type!=3) { // read in user Fourier mode array sizes... @@ -645,17 +649,16 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) // Note: batchSize not used since might be only 1. // Now place FFTW initialization in a lock, courtesy of OMP. Makes FINUFFT - // thread-safe (can be called inside OMP) if -DFFTW_PLAN_SAFE used... 
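The hunk that follows replaces the old `#pragma omp critical` around one-time FFTW setup with a `std::lock_guard` on `fftw_lock`, serializing the initialization for any caller thread rather than only threads OpenMP manages; the same lock later guards `FFTW_PLAN_MANY_DFT` and `FFTW_DE`, since FFTW's planner is not thread-safe. A self-contained sketch of the guarded once-only pattern, where `fftw_setup`/`fftw_set_threads` are placeholders for the `FFTW_INIT`/`FFTW_PLAN_TH` macros:

```cpp
#include <mutex>

static std::mutex fftw_lock;

void fftw_setup() {}           // placeholder for FFTW_INIT()
void fftw_set_threads(int) {}  // placeholder for FFTW_PLAN_TH(nthr)

// Thread-safe one-time initialization: the mutex serializes the
// check-and-set of did_fftw_init, so concurrent plan creations cannot both
// run the FFTW global setup.
void init_fftw_once(int nthr_fft) {
    static bool did_fftw_init = false;
    std::lock_guard<std::mutex> lock(fftw_lock);
    if (!did_fftw_init) {
        fftw_setup();
        fftw_set_threads(nthr_fft);
        did_fftw_init = true;
    }
}
```

A `std::call_once` would also serve for the initialization alone; reusing the mutex makes sense here because planning and plan destruction need the same lock anyway.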
-#pragma omp critical + // thread-safe (can be called inside OMP) { - static bool did_fftw_init = 0; // the only global state of FINUFFT + static bool did_fftw_init = false; // the only global state of FINUFFT + std::lock_guard lock(fftw_lock); if (!did_fftw_init) { FFTW_INIT(); // setup FFTW global state; should only do once FFTW_PLAN_TH(nthr_fft); // ditto - FFTW_PLAN_SF(); // if -DFFTW_PLAN_SAFE, make FFTW thread-safe - did_fftw_init = 1; // insure other FINUFFT threads don't clash + did_fftw_init = true; // ensure other FINUFFT threads don't clash } - } + } p->spopts.spread_direction = type; @@ -705,21 +708,25 @@ int FINUFFT_MAKEPLAN(int type, int dim, BIGINT* n_modes, int iflag, p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { fprintf(stderr, "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); - return ERR_MAXNALLOC; + return FINUFFT_ERR_MAXNALLOC; } - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace + + p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // the big workspace if (p->opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__,(double)1E-09*sizeof(CPX)*p->nf*p->batchSize, timer.elapsedsec()); if(!p->fwBatch) { // we don't catch all such mallocs, just this big one fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n",__func__); free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); - return ERR_ALLOC; + return FINUFFT_ERR_ALLOC; } timer.restart(); // plan the FFTW int *ns = GRIDSIZE_FOR_FFTW(p); // fftw_plan_many_dft args: rank, gridsize/dim, howmany, in, inembed, istride, idist, ot, onembed, ostride, odist, sign, flags - p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, - NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, p->fftSign, p->opts.fftw); + { + std::lock_guard lock(fftw_lock); + p->fftwPlan = FFTW_PLAN_MANY_DFT(dim, ns, p->batchSize, p->fwBatch, NULL, 1, p->nf, p->fwBatch, NULL, 1, p->nf, + p->fftSign, p->opts.fftw); + } if (p->opts.debug) printf("[%s] FFTW plan (mode %d, nthr=%d):\t%.3g s\n", __func__,p->opts.fftw, nthr_fft, timer.elapsedsec()); delete []ns; @@ -769,7 +776,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); if (!p->sortIndices) { fprintf(stderr,"[%s] failed to allocate sortIndices!\n",__func__); - return ERR_SPREAD_ALLOC; + return FINUFFT_ERR_SPREAD_ALLOC; } p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, xj, yj, zj, p->spopts); if (p->opts.debug) printf("[%s] sort (didSort=%d):\t\t%.3g s\n", __func__,p->didSort, timer.elapsedsec()); @@ -817,26 +824,36 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, p->nf = p->nf1*p->nf2*p->nf3; // fine grid total number of points if (p->nf * p->batchSize > MAX_NF) { fprintf(stderr, "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n",__func__); - return ERR_MAXNALLOC; + return FINUFFT_ERR_MAXNALLOC; } - p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace + if (p->fwBatch) + FFTW_FR(p->fwBatch); + p->fwBatch = FFTW_ALLOC_CPX(p->nf * p->batchSize); // maybe big workspace + // (note FFTW_ALLOC is not needed over malloc, but matches its type) + if(p->CpBatch) free(p->CpBatch); p->CpBatch = (CPX*)malloc(sizeof(CPX) * nj*p->batchSize); // batch c' work if (p->opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09*sizeof(CPX)*(p->nf+nj)*p->batchSize, 
timer.elapsedsec()); if(!p->fwBatch || !p->CpBatch) { fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n",__func__); - return ERR_ALLOC; + return FINUFFT_ERR_ALLOC; } //printf("fwbatch, cpbatch ptrs: %llx %llx\n",p->fwBatch,p->CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... + if(p->X) free(p->X); + if(p->Sp) free(p->Sp); p->X = (FLT*)malloc(sizeof(FLT)*nj); p->Sp = (FLT*)malloc(sizeof(FLT)*nk); if (d>1) { + if(p->Y) free(p->Y); + if(p->Tp) free(p->Tp); p->Y = (FLT*)malloc(sizeof(FLT)*nj); p->Tp = (FLT*)malloc(sizeof(FLT)*nk); } if (d>2) { + if(p->Z) free(p->Z); + if(p->Up) free(p->Up); p->Z = (FLT*)malloc(sizeof(FLT)*nj); p->Up = (FLT*)malloc(sizeof(FLT)*nk); } @@ -858,6 +875,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, // set up prephase array... CPX imasign = (p->fftSign>=0) ? IMA : -IMA; // +-i + if(p->prephase) free(p->prephase); p->prephase = (CPX*)malloc(sizeof(CPX)*nj); if (p->t3P.D1!=0.0 || p->t3P.D2!=0.0 || p->t3P.D3!=0.0) { #pragma omp parallel for num_threads(p->opts.nthreads) schedule(static) @@ -885,6 +903,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, // (old STEP 3a) Compute deconvolution post-factors array (per targ pt)... // (exploits that FT separates because kernel is prod of 1D funcs) + if(p->deconv) free(p->deconv); p->deconv = (CPX*)malloc(sizeof(CPX)*nk); FLT *phiHatk1 = (FLT*)malloc(sizeof(FLT)*nk); // don't confuse w/ p->phiHat onedim_nuft_kernel(nk, p->Sp, phiHatk1, p->spopts); // fill phiHat1 @@ -927,7 +946,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, p->sortIndices = (BIGINT *)malloc(sizeof(BIGINT)*p->nj); if (!p->sortIndices) { fprintf(stderr,"[%s t3] failed to allocate sortIndices!\n",__func__); - return ERR_SPREAD_ALLOC; + return FINUFFT_ERR_SPREAD_ALLOC; } p->didSort = indexSort(p->sortIndices, p->nf1, p->nf2, p->nf3, p->nj, p->X, p->Y, p->Z, p->spopts); if (p->opts.debug) printf("[%s t3] sort (didSort=%d):\t\t%.3g s\n",__func__, p->didSort, timer.elapsedsec()); @@ -941,6 +960,7 @@ int FINUFFT_SETPTS(FINUFFT_PLAN p, BIGINT nj, FLT* xj, FLT* yj, FLT* zj, t2opts.spread_debug = max(0,p->opts.spread_debug-1); t2opts.showwarn = 0; // so don't see warnings 2x // (...could vary other t2opts here?) 
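A pattern worth calling out in the type-3 `FINUFFT_SETPTS` hunks above (and continued just below with `innerT2plan`): every buffer the call allocates is now freed first if already present, so calling `setpts` a second time on the same plan re-allocates instead of leaking. Since plans are created zeroed, freeing the never-set pointers is a no-op; a sketch of the idiom:

```cpp
#include <cstdlib>

// Free-before-alloc so repeated setpts calls on one plan do not leak.
// Assumes the pointer starts life as nullptr (the plan struct is zeroed at
// creation), so the first call's free(nullptr) is a harmless no-op.
template <typename T>
void replace_buffer(T *&buf, size_t n) {
    free(buf);  // drop the previous call's array, if any
    buf = static_cast<T *>(malloc(sizeof(T) * n));
}
```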
+ if(p->innerT2plan) FINUFFT_DESTROY(p->innerT2plan); int ier = FINUFFT_MAKEPLAN(2, d, t2nmodes, p->fftSign, p->batchSize, p->tol, &p->innerT2plan, &t2opts); if (ier>1) { // if merely warning, still proceed @@ -1109,10 +1129,14 @@ int FINUFFT_DESTROY(FINUFFT_PLAN p) { if (!p) // NULL ptr, so not a ptr to a plan, report error return 1; - FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array + + FFTW_FR(p->fwBatch); // free the big FFTW (or t3 spread) working array free(p->sortIndices); if (p->type==1 || p->type==2) { - FFTW_DE(p->fftwPlan); + { + std::lock_guard lock(fftw_lock); + FFTW_DE(p->fftwPlan); + } free(p->phiHat1); free(p->phiHat2); free(p->phiHat3); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index b34b7c2a5..8d6e34d96 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -153,7 +153,7 @@ int spreadinterp( BIGINT* sort_indices = (BIGINT*)malloc(sizeof(BIGINT)*M); if (!sort_indices) { fprintf(stderr,"%s failed to allocate sort_indices!\n",__func__); - return ERR_SPREAD_ALLOC; + return FINUFFT_ERR_SPREAD_ALLOC; } int did_sort = indexSort(sort_indices, N1, N2, N3, M, kx, ky, kz, opts); spreadinterpSorted(sort_indices, N1, N2, N3, data_uniform, @@ -187,11 +187,11 @@ int spreadcheck(BIGINT N1, BIGINT N2, BIGINT N3, BIGINT M, FLT *kx, FLT *ky, int minN = 2*opts.nspread; if (N11 && N21 && N33.0*PI) : (kx[i]<-N1 || kx[i]>2*N1)) || !isfinite(kx[i])) { fprintf(stderr,"%s NU pt not in valid range (central three periods): kx[%lld]=%.16g, N1=%lld (pirange=%d)\n",__func__, (long long)i, kx[i], (long long)N1,opts.pirange); - return ERR_SPREAD_PTS_OUT_RANGE; + return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; } } if (ndims>1) for (BIGINT i=0; i3.0*PI) : (ky[i]<-N2 || ky[i]>2*N2)) || !isfinite(ky[i])) { fprintf(stderr,"%s NU pt not in valid range (central three periods): ky[%lld]=%.16g, N2=%lld (pirange=%d)\n",__func__, (long long)i, ky[i], (long long)N2,opts.pirange); - return ERR_SPREAD_PTS_OUT_RANGE; + return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; } } if (ndims>2) for (BIGINT i=0; i3.0*PI) : (kz[i]<-N3 || kz[i]>2*N3)) || !isfinite(kz[i])) { fprintf(stderr,"%s NU pt not in valid range (central three periods): kz[%lld]=%.16g, N3=%lld (pirange=%d)\n",__func__, (long long)i, kz[i], (long long)N3,opts.pirange); - return ERR_SPREAD_PTS_OUT_RANGE; + return FINUFFT_ERR_SPREAD_PTS_OUT_RANGE; } } if (opts.debug) printf("\tNU bnds check:\t\t%.3g s\n",timer.elapsedsec()); @@ -560,7 +560,7 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, Must call this before any kernel evals done, otherwise segfault likely. Returns: 0 : success - WARN_EPS_TOO_SMALL : requested eps cannot be achieved, but proceed with + FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. @@ -569,11 +569,11 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, if (upsampfac!=2.0 && upsampfac!=1.25) { // nonstandard sigma if (kerevalmeth==1) { fprintf(stderr,"FINUFFT setup_spreader: nonstandard upsampfac=%.3g cannot be handled by kerevalmeth=1\n",upsampfac); - return ERR_HORNER_WRONG_BETA; + return FINUFFT_ERR_HORNER_WRONG_BETA; } if (upsampfac<=1.0) { // no digits would result fprintf(stderr,"FINUFFT setup_spreader: error, upsampfac=%.3g is <=1.0\n",upsampfac); - return ERR_UPSAMPFAC_TOO_SMALL; + return FINUFFT_ERR_UPSAMPFAC_TOO_SMALL; } // calling routine must abort on above errors, since opts is garbage! 
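For reference, the bounds test behind `FINUFFT_ERR_SPREAD_PTS_OUT_RANGE` in the `spreadcheck` hunk above: a nonuniform point is accepted only if it is finite and lies within the "central three periods", i.e. |x| <= 3*pi when `pirange=1`, or -N <= x <= 2N when `pirange=0`. As a standalone predicate (sketch):

```cpp
#include <cmath>

constexpr double kPi = 3.14159265358979323846;

// True iff one NU coordinate is in the valid central-three-periods range.
bool nu_pt_in_range(double x, long long N, int pirange) {
    if (!std::isfinite(x)) return false;  // rejects Inf and NaN inputs
    return pirange ? std::fabs(x) <= 3.0 * kPi
                   : (x >= -(double)N && x <= 2.0 * (double)N);
}
```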
if (showwarn && upsampfac>4.0) @@ -602,7 +602,7 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, if (showwarn) fprintf(stderr,"%s warning: increasing tol=%.3g to eps_mach=%.3g.\n",__func__,(double)eps,(double)EPSILON); eps = EPSILON; // only changes local copy (not any opts) - ier = WARN_EPS_TOO_SMALL; + ier = FINUFFT_WARN_EPS_TOO_SMALL; } if (upsampfac==2.0) // standard sigma (see SISC paper) ns = std::ceil(-log10(eps/(FLT)10.0)); // 1 digit per power of 10 @@ -614,7 +614,7 @@ int setup_spreader(finufft_spread_opts &opts, FLT eps, double upsampfac, fprintf(stderr,"%s warning: at upsampfac=%.3g, tol=%.3g would need kernel width ns=%d; clipping to max %d.\n",__func__, upsampfac,(double)eps,ns,MAX_NSPREAD); ns = MAX_NSPREAD; - ier = WARN_EPS_TOO_SMALL; + ier = FINUFFT_WARN_EPS_TOO_SMALL; } opts.nspread = ns; diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt index 400c04589..23b3346da 100644 --- a/test/cuda/CMakeLists.txt +++ b/test/cuda/CMakeLists.txt @@ -73,3 +73,4 @@ add_tests(float 1e-5 2e-4) add_tests(double 1e-12 1e-11) add_test(NAME cufinufft_public_api COMMAND public_api_test) +add_test(NAME cufinufft_makeplan COMMAND test_makeplan) diff --git a/test/cuda/cufinufft1d_test.cu b/test/cuda/cufinufft1d_test.cu index ad0be2e6f..bb2d96758 100644 --- a/test/cuda/cufinufft1d_test.cu +++ b/test/cuda/cufinufft1d_test.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -82,11 +82,8 @@ int run_test(int method, int type, int N1, int M, T tol, T checktol, int iflag) // Here we setup our own opts, for gpu_method. cufinufft_opts opts; - ier = cufinufft_default_opts(type, dim, &opts); - if (ier != 0) { - printf("err %d: cufinufft_default_opts\n", ier); - return ier; - } + cufinufft_default_opts(&opts); + opts.gpu_method = method; opts.gpu_maxbatchsize = 1; diff --git a/test/cuda/cufinufft2d1nupts_test.cu b/test/cuda/cufinufft2d1nupts_test.cu index 8e866b6ec..409c42625 100644 --- a/test/cuda/cufinufft2d1nupts_test.cu +++ b/test/cuda/cufinufft2d1nupts_test.cu @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include @@ -93,11 +93,7 @@ int run_test(int method) { // Here we setup our own opts, for gpu_method. cufinufft_opts opts; - ier = cufinufft_default_opts(type, dim, &opts); - if (ier != 0) { - printf("err %d: cufinufft_default_opts\n", ier); - return ier; - } + cufinufft_default_opts(&opts); opts.gpu_method = method; opts.gpu_maxbatchsize = 1; diff --git a/test/cuda/cufinufft2d_test.cu b/test/cuda/cufinufft2d_test.cu index 1202dfb6f..371b44b2f 100644 --- a/test/cuda/cufinufft2d_test.cu +++ b/test/cuda/cufinufft2d_test.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -82,11 +82,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int // Here we setup our own opts, for gpu_method. 
cufinufft_opts opts; - int ier = cufinufft_default_opts(type, dim, &opts); - if (ier != 0) { - printf("err %d: cufinufft_default_opts\n", ier); - return ier; - } + cufinufft_default_opts(&opts); opts.gpu_method = method; opts.gpu_maxbatchsize = 1; @@ -94,7 +90,7 @@ int run_test(int method, int type, int N1, int N2, int M, T tol, T checktol, int int nmodes[3] = {N1, N2, 1}; int ntransf = 1; cudaEventRecord(start); - ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); + int ier = cufinufft_makeplan_impl(type, dim, nmodes, iflag, ntransf, tol, &dplan, &opts); if (ier != 0) { printf("err: cufinufft2d_plan\n"); return ier; diff --git a/test/cuda/cufinufft2dmany_test.cu b/test/cuda/cufinufft2dmany_test.cu index 67c4d4177..9c984e728 100644 --- a/test/cuda/cufinufft2dmany_test.cu +++ b/test/cuda/cufinufft2dmany_test.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -86,11 +86,8 @@ int run_test(int method, int type, int N1, int N2, int ntransf, int maxbatchsize // Here we setup our own opts, for gpu_method. cufinufft_opts opts; - ier = cufinufft_default_opts(type, dim, &opts); - if (ier != 0) { - printf("err %d: cufinufft_default_opts\n", ier); - return ier; - } + cufinufft_default_opts(&opts); + opts.gpu_method = method; opts.gpu_maxbatchsize = maxbatchsize; diff --git a/test/cuda/cufinufft3d_test.cu b/test/cuda/cufinufft3d_test.cu index df141a8fd..a882f6715 100644 --- a/test/cuda/cufinufft3d_test.cu +++ b/test/cuda/cufinufft3d_test.cu @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -86,11 +86,8 @@ int run_test(int method, int type, int N1, int N2, int N3, int M, T tol, T check // Here we setup our own opts, for gpu_method and gpu_kerevalmeth. 
cufinufft_opts opts; - ier = cufinufft_default_opts(type, dim, &opts); - if (ier != 0) { - printf("err %d: cufinufft_default_opts\n", ier); - return ier; - } + cufinufft_default_opts(&opts); + opts.gpu_method = method; opts.gpu_kerevalmeth = 1; opts.gpu_maxbatchsize = 1; diff --git a/test/cuda/fseries_kernel_test.cu b/test/cuda/fseries_kernel_test.cu index aa5ca1be6..e0d09be61 100644 --- a/test/cuda/fseries_kernel_test.cu +++ b/test/cuda/fseries_kernel_test.cu @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include diff --git a/test/cuda/interp1d_test.cu b/test/cuda/interp1d_test.cu deleted file mode 100644 index 677a98f95..000000000 --- a/test/cuda/interp1d_test.cu +++ /dev/null @@ -1,175 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include - -using namespace cufinufft::common; -using namespace cufinufft::spreadinterp; -using namespace cufinufft::utils; - -template -int run_test(int method, int nupts_distribute, int nf1, int M, T tol, int kerevalmeth, int sort) { - using real_t = T; - using complex_t = cuda_complex; - - int ier; - std::cout << std::scientific << std::setprecision(3); - - real_t *x; - complex_t *c, *fw; - cudaMallocHost(&x, M * sizeof(real_t)); - cudaMallocHost(&c, M * sizeof(complex_t)); - cudaMallocHost(&fw, nf1 * sizeof(complex_t)); - - real_t *d_x; - complex_t *d_c, *d_fw; - checkCudaErrors(cudaMalloc(&d_x, M * sizeof(real_t))); - checkCudaErrors(cudaMalloc(&d_c, M * sizeof(complex_t))); - checkCudaErrors(cudaMalloc(&d_fw, nf1 * sizeof(complex_t))); - - int dim = 1; - cufinufft_plan_t *dplan; - dplan = new cufinufft_plan_t; - // Zero out your struct, (sets all pointers to NULL, crucial) - memset(dplan, 0, sizeof(*dplan)); - ier = cufinufft_default_opts(2, dim, &(dplan->opts)); - dplan->opts.gpu_method = method; - dplan->opts.gpu_maxsubprobsize = 1024; - dplan->opts.gpu_kerevalmeth = kerevalmeth; - dplan->opts.gpu_sort = sort; - dplan->opts.gpu_spreadinterponly = 1; - dplan->opts.gpu_binsizex = 1024; // binsize needs to be set here, since - // SETUP_BINSIZE() is not called in - // spread, interp only wrappers. 
- ier = setup_spreader_for_nufft(dplan->spopts, tol, dplan->opts); - - std::default_random_engine eng(1); - std::uniform_real_distribution dist01(0, 1); - std::uniform_real_distribution dist11(-1, 1); - auto rand01 = [&eng, &dist01]() { return dist01(eng); }; - auto randm11 = [&eng, &dist11]() { return dist11(eng); }; - - switch (nupts_distribute) { - case 0: // uniform - { - for (int i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - } - } break; - case 1: // concentrate on a small region - { - for (int i = 0; i < M; i++) { - x[i] = M_PI * rand01() / (nf1 * 2 / 32); // x in [-pi,pi) - } - } break; - } - for (int i = 0; i < nf1; i++) { - fw[i].x = 1.0; - fw[i].y = 0.0; - } - - checkCudaErrors(cudaMemcpy(d_x, x, M * sizeof(real_t), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_fw, fw, nf1 * sizeof(complex_t), cudaMemcpyHostToDevice)); - - CNTime timer; - timer.restart(); - ier = cufinufft::spreadinterp::cufinufft_interp1d(nf1, d_fw, M, d_x, d_c, dplan); - if (ier != 0) { - std::cout << "error: cnufftinterp2d" << std::endl; - return 0; - } - real_t t = timer.elapsedsec(); - printf("[Method %d] %ld U pts to #%d NU pts in %.3g s (\t%.3g NU pts/s)\n", dplan->opts.gpu_method, nf1, M, t, - M / t); - checkCudaErrors(cudaMemcpy(c, d_c, M * sizeof(complex_t), cudaMemcpyDeviceToHost)); - - std::cout << "[result-input]" << std::endl; - for (int j = 0; j < std::min(20, M); j++) { - printf(" (%2.3g,%2.3g)", c[j].x, c[j].y); - std::cout << std::endl; - } - std::cout << std::endl; - - free(dplan); - cudaFreeHost(x); - cudaFreeHost(c); - cudaFreeHost(fw); - cudaFree(d_x); - cudaFree(d_c); - cudaFree(d_fw); - - return 0; -} - -int main(int argc, char *argv[]) { - int nf1; - double upsampfac = 2.0; - int N1, M; - if (argc < 4) { - fprintf(stderr, "Usage: interp1d method nupts_distr nf1 [M [tol [kerevalmeth [sort]]]]\n" - "Arguments:\n" - " method: One of\n" - " 1: nupts driven\n" - " nupts_distr: The distribution of the points; one of\n" - " 0: uniform, or\n" - " 1: concentrated in a small region.\n" - " nf1: The size of the 2D array.\n" - " M: The number of non-uniform points (default nf1 / 2).\n" - " tol: NUFFT tolerance (default 1e-6).\n" - " kerevalmeth: Kernel evaluation method; one of\n" - " 0: Exponential of square root (default), or\n" - " 1: Horner evaluation.\n" - " sort: One of\n" - " 0: do not sort the points, or\n" - " 1: sort the points (default).\n"); - return 1; - } - double w; - int method; - sscanf(argv[1], "%d", &method); - int nupts_distribute; - sscanf(argv[2], "%d", &nupts_distribute); - sscanf(argv[3], "%lf", &w); - nf1 = (int)w; // so can read 1e6 right! - - N1 = (int)nf1 / upsampfac; - M = N1; // let density always be 1 - if (argc > 4) { - sscanf(argv[4], "%lf", &w); - M = (int)w; // so can read 1e6 right! - if (M == 0) - M = N1; - } - - double tol = 1e-6; - if (argc > 5) { - sscanf(argv[5], "%lf", &w); - tol = w; // so can read 1e6 right! 
- } - - int kerevalmeth = 0; - if (argc > 6) { - sscanf(argv[6], "%d", &kerevalmeth); - } - - int sort = 1; - if (argc > 7) { - sscanf(argv[7], "%d", &sort); - } - - printf("float test\n"); - run_test(method, nupts_distribute, nf1, M, tol, kerevalmeth, sort); - printf("double test\n"); - run_test(method, nupts_distribute, nf1, M, tol, kerevalmeth, sort); - - return 0; -} diff --git a/test/cuda/test_makeplan.c b/test/cuda/test_makeplan.c new file mode 100644 index 000000000..cbec02915 --- /dev/null +++ b/test/cuda/test_makeplan.c @@ -0,0 +1,196 @@ +#include "finufft_errors.h" +#ifdef NDEBUG +#undef NDEBUG +#include +#define NDEBUG +#else +#include +#endif + +#include +#include +#include + +#include +#include + +typedef struct { + char *p[2]; +} wasteful_pointers; + +// hackish way to make allocation failures happen +wasteful_pointers alloc_remaining_device_mem() { + wasteful_pointers a = {NULL, NULL}; + for (int i = 0; i < 2; ++i) { + size_t free, total; + cudaMemGetInfo(&free, &total); + + int ier = 1; + int iter = 0; + while (ier && (iter < 60)) { + ier = cudaMalloc((void **)&a.p[i], free - (1 << iter)); + iter++; + } + } + + return a; +} + +void free_wasteful_pointers(wasteful_pointers a) { + cudaFree(a.p[0]); + cudaFree(a.p[1]); +} + +int main() { + cufinufftf_plan plan; + // defaults. tests should shadow them to override + const int dim = 1; + const int type = 1; + const int iflag = 1; + const float tol = 1e-5; + const int ntransf = 1; + const int64_t N[3] = {10, 20, 15}; + + // Dimension failure + { + const int dim = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_DIM_NOTVALID); + } + + // 1D failure modes + { + const int dim = 1; + const int type = 1; + const int ntransf = 1; + + // nice input should succeed + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + + // Ignore higher dims, even if invalid + { + int64_t N[3] = {10, 0, 15}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + } + + { + int64_t N[3] = {0, 20, 15}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + // cufinufft can't handle arrays bigger than INT32_MAX (cufft limitation) + { + int64_t N[3] = {(int64_t)INT32_MAX + 1, 1, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + { + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); + } + + { + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); + } + + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {INT32_MAX, 1, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); */ + /* free_wasteful_pointers(p); */ + /* } */ + } + + { + const int dim = 2; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + + { + int64_t N[3] = {10, 0, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + // FIXME: nf calculation overflows -- need to handle upsampling mode calculation properly + /* { */ + /* int64_t N[3] = {INT32_MAX / 2, 2, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, 
&plan, NULL) == 0); */ + /* } */ + + { + int64_t N[3] = {INT32_MAX, 2, 1}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + { + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); + } + + { + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); + } + + { + cufinufft_opts opts; + cufinufft_default_opts(&opts); + opts.upsampfac = 0.9; + opts.gpu_kerevalmeth = 1; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == FINUFFT_ERR_HORNER_WRONG_BETA); + + opts.gpu_kerevalmeth = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == + FINUFFT_ERR_UPSAMPFAC_TOO_SMALL); + + // Should produce a warning, not an error + opts.upsampfac = 4.5; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, &opts) == 0); + cufinufftf_destroy(plan); + } + + // This technique to cause cuda failures works most of the time, but sometimes would + // break following calls and could cause issues with other contexts using the same GPU + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {sqrt(INT32_MAX - 1), sqrt(INT32_MAX) - 1, 1}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); */ + /* free_wasteful_pointers(p); */ + /* } */ + } + + { + const int dim = 3; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == 0); + cufinufftf_destroy(plan); + + { + int64_t N[3] = {10, 15, 0}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + { + int64_t N[3] = {INT32_MAX / 2, 2, 2}; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NDATA_NOTVALID); + } + + { + const int type = 4; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_TYPE_NOTVALID); + } + + { + const int ntransf = 0; + assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_NTRANS_NOTVALID); + } + + /* { */ + /* wasteful_pointers p = alloc_remaining_device_mem(); */ + /* int64_t N[3] = {pow(INT32_MAX - 1, 1.0 / 3), pow(INT32_MAX - 1, 1.0 / 3), pow(INT32_MAX - 1, 1.0 / 3)}; */ + /* assert(cufinufftf_makeplan(type, dim, N, iflag, ntransf, tol, &plan, NULL) == FINUFFT_ERR_CUDA_FAILURE); */ + /* free_wasteful_pointers(p); */ + /* } */ + } +} diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp index a9d46c593..ba3a3f328 100644 --- a/test/dumbinputs.cpp +++ b/test/dumbinputs.cpp @@ -87,7 +87,7 @@ int main(int argc, char* argv[]) // 111111111111111111111111111111111111111111111111111111111111111111111111 printf("1D dumb cases.\n"); int ier = FINUFFT1D1(M,x,c,+1,0,N,F,&opts); - if (ier != WARN_EPS_TOO_SMALL) { + if (ier != FINUFFT_WARN_EPS_TOO_SMALL) { printf("1d1 tol=0:\twrong err code %d\n",ier); return 1; } @@ -105,26 +105,26 @@ int main(int argc, char* argv[]) FLT xsave = x[0]; x[0] = 3*PI*(1 + 2*EPSILON); // works in either prec, just outside valid ier = FINUFFT1D1(M,x,c,+1,acc,N,F,&opts); - if (ier != ERR_SPREAD_PTS_OUT_RANGE) { + if (ier != FINUFFT_ERR_SPREAD_PTS_OUT_RANGE) { printf("1d1 x>3pi:\twrong err code %d\n",ier); return 1; } x[0] = INFINITY; ier = FINUFFT1D1(M,x,c,+1,acc,N,F,&opts); - if (ier != ERR_SPREAD_PTS_OUT_RANGE) { + if (ier != 
FINUFFT_ERR_SPREAD_PTS_OUT_RANGE) { printf("1d1 x=Inf:\twrong err code %d\n",ier); return 1; } x[0] = NAN; ier = FINUFFT1D1(M,x,c,+1,acc,N,F,&opts); - if (ier != ERR_SPREAD_PTS_OUT_RANGE) { + if (ier != FINUFFT_ERR_SPREAD_PTS_OUT_RANGE) { printf("1d1 x=NaN:\twrong err code %d\n",ier); return 1; } x[0] = xsave; for (int k=0; k=${CUDA_DOT_VERSION} brand=tesla,driver>=418,drive # runtime RUN yum install -y \ - libcufft-${CUDA_DASH_VERSION}-${CUFFT_VERSION}-1 \ - libcurand-${CUDA_DASH_VERSION}-${CURAND_VERSION}-1 \ - cuda-nvtx-${CUDA_DASH_VERSION}-${NVTX_VERSION}-1 && \ + cuda-libraries-${CUDA_DASH_VERSION} \ + cuda-nvtx-${CUDA_DASH_VERSION} && \ rm -rf /var/cache/yum/* # devel RUN yum install -y \ - cuda-cudart-devel-${CUDA_DASH_VERSION}-${CUDART_VERSION}-1 \ - libcufft-devel-${CUDA_DASH_VERSION}-${CUFFT_VERSION}-1 \ - libcurand-devel-${CUDA_DASH_VERSION}-${CURAND_VERSION}-1 \ - cuda-nvprof-${CUDA_DASH_VERSION}-${NVPROF_VERSION}-1 \ - cuda-nvcc-${CUDA_DASH_VERSION}-${NVCC_VERSION}-1 && \ + cuda-cudart-devel-${CUDA_DASH_VERSION} \ + cuda-libraries-devel-${CUDA_DASH_VERSION} \ + cuda-nvprof-${CUDA_DASH_VERSION} \ + cuda-nvcc-${CUDA_DASH_VERSION} && \ rm -rf /var/cache/yum/* ENV LIBRARY_PATH /usr/local/cuda/lib64/stubs @@ -67,41 +57,9 @@ RUN yum install -y \ devtoolset-9-gcc-c++ && \ rm -rf /var/cache/yum/* -RUN scl enable devtoolset-9 -- g++ --version +ENV PATH /opt/rh/devtoolset-9/root/usr/bin:${PATH} # finufft reqs RUN yum install -y \ cmake && \ rm -rf /var/cache/yum/* - -# Okay, so now we can begin cufinufft -# -# Only copy the C++/CUDA code (this way the cache is not invalidated by changes -# to the Python code or to the docs). -COPY src /io/src -COPY fortran /io/fortran -COPY include /io/include -COPY test /io/test -COPY perftest /io/perftest -COPY examples /io/examples -COPY cmake /io/cmake -COPY contrib /io/contrib -COPY CMakeLists.txt /io/ - -# Set up cmake and build the code. -RUN mkdir -p /io/build -WORKDIR /io/build -RUN scl enable devtoolset-9 -- cmake -D FINUFFT_USE_CUDA=ON -D FINUFFT_USE_CPU=OFF -D CMAKE_CUDA_ARCHITECTURES="35;50;60;70;75;80" -DBUILD_TESTING=ON -DFINUFFT_BUILD_TESTS=ON .. -RUN scl enable devtoolset-9 -- make -j4 - -WORKDIR /io - -# And we need to pack it in our LD path -ENV LD_LIBRARY_PATH /io/build:${LD_LIBRARY_PATH} - -# Now copy the rest of the repository in. -COPY python /io/python -COPY docs /io/docs -COPY tools /io/tools - -CMD ["/bin/bash"] diff --git a/tools/cufinufft/docker/cuda11.0/README b/tools/cufinufft/docker/cuda11.2/README similarity index 100% rename from tools/cufinufft/docker/cuda11.0/README rename to tools/cufinufft/docker/cuda11.2/README diff --git a/tools/cufinufft/docker/cuda11.0/cuda.repo b/tools/cufinufft/docker/cuda11.2/cuda.repo similarity index 100% rename from tools/cufinufft/docker/cuda11.0/cuda.repo rename to tools/cufinufft/docker/cuda11.2/cuda.repo diff --git a/tools/finufft/build-wheels-linux.sh b/tools/finufft/build-wheels-linux.sh index 7086f6ea3..638ae80f7 100755 --- a/tools/finufft/build-wheels-linux.sh +++ b/tools/finufft/build-wheels-linux.sh @@ -46,7 +46,9 @@ done # test wheel for pybin in "${pys[@]}"; do - "${pybin}/pip" install finufft -f ./python/finufft/wheelhouse/ + "${pybin}/pip" install --pre finufft -f ./python/finufft/wheelhouse/ "${pybin}/python" ./python/finufft/test/run_accuracy_tests.py "${pybin}/python" ./python/finufft/examples/simple1d1.py + "${pybin}/pip" install pytest + "${pybin}/pytest" python/finufft/test done
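Finally, the API change that threads through all the test updates above: `cufinufft_default_opts` no longer takes `type` and `dim` and now returns `void`, with method selection deferred to plan time (`gpu_method = 0` means auto, and Horner evaluation is the new default). A minimal sketch of the updated calling sequence (values illustrative):

```cpp
#include <cstdint>
#include <cufinufft.h>

// Post-change calling sequence: fill options first, then make the plan.
int make_plan_example(cufinufftf_plan *plan) {
    cufinufft_opts opts;
    cufinufft_default_opts(&opts);  // was: ier = cufinufft_default_opts(type, dim, &opts)
    opts.gpu_method = 0;            // 0 = pick automatically at plan time

    const int64_t nmodes[3] = {1000, 1, 1};
    return cufinufftf_makeplan(/*type=*/1, /*dim=*/1, nmodes, /*iflag=*/1,
                               /*ntransf=*/1, /*tol=*/1e-5f, plan, &opts);
}
```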