diff --git a/.github/workflows/python_build_win.ps1 b/.github/workflows/python_build_win.ps1
index fcb3d611f..072413d42 100644
--- a/.github/workflows/python_build_win.ps1
+++ b/.github/workflows/python_build_win.ps1
@@ -41,6 +41,8 @@ Copy-Item -Path C:\msys64\mingw64\bin\libgomp-*.dll -Destination ([IO.Path]::Com
Copy-Item -Path C:\msys64\mingw64\bin\libwinpthread-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft'))
Copy-Item -Path C:\msys64\mingw64\bin\libfftw3-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft'))
Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft'))
+Copy-Item -Path C:\msys64\mingw64\bin\libfftw3_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft'))
+Copy-Item -Path C:\msys64\mingw64\bin\libfftw3f_omp-*.dll -Destination ([IO.Path]::Combine($unpacked_wheel, 'finufft'))
New-Item -Path .\wheelhouse -ItemType Directory -Force
wheel.exe pack $unpacked_wheel -d .\wheelhouse
if (-not $?) {throw "Failed pack wheel"}
diff --git a/.github/workflows/python_test_win.ps1 b/.github/workflows/python_test_win.ps1
index 3e77ef049..949a6906d 100644
--- a/.github/workflows/python_test_win.ps1
+++ b/.github/workflows/python_test_win.ps1
@@ -1,6 +1,9 @@
-python -m pip install finufft -f .\wheelhouse\
+python -m pip install --pre finufft -f .\wheelhouse\
if (-not $?) {throw "Failed to pip install finufft"}
python python/finufft/test/run_accuracy_tests.py
if (-not $?) {throw "Tests failed"}
python python/finufft/examples/simple1d1.py
if (-not $?) {throw "Simple1d1 test failed"}
+python -m pip install pytest
+python -m pytest python/finufft/test
+if (-not $?) {throw "Pytest suite failed"}
diff --git a/.github/workflows/python_wheel.yml b/.github/workflows/python_wheel.yml
index 095653cc7..fd0db91bb 100644
--- a/.github/workflows/python_wheel.yml
+++ b/.github/workflows/python_wheel.yml
@@ -34,7 +34,7 @@ jobs:
uses: actions/upload-artifact@v2
with:
name: linux-wheels
- path: python/wheelhouse/finufft*manylinux*.whl
+ path: python/finufft/wheelhouse/finufft*manylinux*.whl
MacOS:
runs-on: macos-latest
@@ -60,19 +60,6 @@ jobs:
# hack to make libquadmath link statically
sudo rm /usr/local/opt/gcc@11/lib/gcc/11/libquadmath.*dylib
- - name: Cache macOS Python binaries
- id: cache-python
- uses: actions/cache@v3
- with:
- path: |
- /Library/Frameworks/Python.framework/Versions/3.6
- /Library/Frameworks/Python.framework/Versions/3.7
- /Library/Frameworks/Python.framework/Versions/3.8
- /Library/Frameworks/Python.framework/Versions/3.9
- /Library/Frameworks/Python.framework/Versions/3.10
- /Library/Frameworks/Python.framework/Versions/3.11
- key: macos-python-3.6.8-macosx10.9-python3.7.9-macosx10.9-python3.8.3-macosx10.9-python3.9.7-macos11-python3.10.1-macos11-python3.11.0-macos11
-
# Download and install Python instead of using the setup_python
# as the python interpreters in the Github machines
# were compiled in 10.14, the wheels built with them
@@ -138,30 +125,42 @@ jobs:
PYTHON_BIN=/Library/Frameworks/Python.framework/Versions/3.11/bin/
$PYTHON_BIN/python3 -m pip install delocate
ls wheelhouse/finufft*.whl | xargs -n1 $PYTHON_BIN/delocate-wheel -w fixed_wheel/
- /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 examples/simple1d1.py
- /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -m pytest test
+ /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 examples/simple1d1.py
- /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -m pytest test
+ /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 examples/simple1d1.py
- /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -m pytest test
+ /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 examples/simple1d1.py
- /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -m pytest test
+ /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 examples/simple1d1.py
- /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install finufft -f fixed_wheel/
+ /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -m pytest test
+ /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install --pre finufft -f fixed_wheel/
/Library/Frameworks/Python.framework/Versions/3.11/bin/python3 test/run_accuracy_tests.py
/Library/Frameworks/Python.framework/Versions/3.11/bin/python3 examples/simple1d1.py
+ /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pip install pytest
+ /Library/Frameworks/Python.framework/Versions/3.11/bin/python3 -m pytest test
- name: Upload wheels
uses: actions/upload-artifact@v2
with:
name: macos-wheels
- path: python/fixed_wheel/*.whl
+ path: python/finufft/fixed_wheel/*.whl
Windows:
runs-on: windows-latest
diff --git a/.readthedocs.yml b/.readthedocs.yaml
similarity index 79%
rename from .readthedocs.yml
rename to .readthedocs.yaml
index dc748b7c0..0f63d4f34 100644
--- a/.readthedocs.yml
+++ b/.readthedocs.yaml
@@ -1,12 +1,16 @@
# .readthedocs.yml
# "Read the Docs" doc-hosting website configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
-#
-# Barnett 10/5/20.
# Required (for this file format)
version: 2
+# Set the OS, Python version and other tools you might need
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.11"
+
# Build all formats
formats: all
@@ -16,6 +20,5 @@ sphinx:
# Optionally set the version of Python and requirements required to build your docs
python:
- version: 3.7
install:
- requirements: docs/requirements.txt
diff --git a/CHANGELOG b/CHANGELOG
index e2932f637..9aa0bf1c1 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,7 +1,12 @@
List of features / changes made / release notes, in reverse chronological order.
If not stated, FINUFFT is assumed (up to v 1.3 CUFINUFFT indicated separately).
-V 2.2.0 (not yet released... planning summer 2023):
+V 2.3.0 (expected Dec 2023)
+
+* finally thread-safety for all fftw cases, kill FFTW_PLAN_SAFE (PR 354)
+* python interface edge cases (singleton dims, #395). Awaiting #367
+
+V 2.2.0 (08/30/23) (untagged; aka 2.2.0.dev0, a name used to bump python pkg)
* MERGE OF CUFINUFFT (GPU CODE) INTO FINUFFT SOURCE TREE:
- combined cmake build system via FINUFFT_USE_CUDA flag
@@ -13,7 +18,7 @@ V 2.2.0 (not yet released... planning summer 2023):
- cufinufft repo will be obsoleted.
- coding leads on this: Robert Blackwell and Joakim Anden
* cmake build structure (thanks: Wenda Zhou, Marco Barbone, Libin Lu)
- - Note: the plan is to make the makefiles/make.inc.* obsolete by 2023 end.
+ - Note: the plan is to make the makefiles/make.inc.* obsolete.
* interp (for type 2) accel by up to 60% in high-acc 2D/3D, by FMA/SIMD-friendly
rearranging of loops, by Martin Reinecke, PR #292.
* remove inv array in binsort; speeds up multithreaded case by up to 50%
@@ -25,7 +30,7 @@ V 2.2.0 (not yet released... planning summer 2023):
remains public, as should be (PR #233). Allows plan to have C++ constructs.
* fixed single-thread (OMP=OFF) build which failed due to fftw_defs.h/defs.h
-CUFINUFFT v 1.3 (06/10/23)
+CUFINUFFT v 1.3 (06/10/23) (Final legacy release outside FINUFFT repo)
* Move second half of onedim_fseries_kernel() to GPU (with a simple heuristic
basing on nf1 to switch between the CPU and the GPU version).
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 311d4feaa..9e3ee23a7 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.19)
-project(finufft VERSION 2.1.0 LANGUAGES C CXX)
+project(finufft VERSION 2.2.0 LANGUAGES C CXX)
set(GNU_LIKE_FRONTENDS AppleClang Clang GNU)
if(CMAKE_CXX_COMPILER_ID IN_LIST GNU_LIKE_FRONTENDS)
@@ -11,7 +11,12 @@ endif()
include(CTest)
-set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
+if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc|ppc64|powerpc|powerpc64" OR (APPLE AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc|ppc64"))
+ # PowerPC arch does not have -march flag.
+ set(FINUFFT_ARCH_FLAGS "-mtune=native" CACHE STRING "Compiler flags for specifying target architecture.")
+else()
+ set(FINUFFT_ARCH_FLAGS "-march=native" CACHE STRING "Compiler flags for specifying target architecture.")
+endif()
set(FINUFFT_FFTW_SUFFIX "OpenMP" CACHE STRING "Suffix for FFTW libraries (e.g. OpenMP, Threads etc.)")
set(FINUFFT_FFTW_LIBRARIES "DEFAULT" CACHE STRING "Specify a custom FFTW library")
diff --git a/CMakePresets.json b/CMakePresets.json
index 8acb6dc1c..2363692b1 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -55,6 +55,20 @@
"FINUFFT_USE_OPENMP": "OFF"
}
},
+ {
+ "name": "icx",
+ "binaryDir": "build/icx",
+ "displayName": "Intel Compiler (llvm)",
+ "description": "Build with Intel Compiler",
+ "generator": "Ninja Multi-Config",
+ "cacheVariables": {
+ "CMAKE_C_COMPILER": "icx",
+ "CMAKE_CXX_COMPILER": "icpx",
+ "CMAKE_Fortran_COMPILER": "ifx",
+ "FINUFFT_ARCH_FLAGS": "-xHost",
+ "CMAKE_CXX_FLAGS": "-fp-model=strict"
+ }
+ },
{
"name": "icc",
"binaryDir": "build/icc",
@@ -109,6 +123,11 @@
"configurePreset": "icc",
"configuration": "RelWithDebInfo"
},
+ {
+ "name": "icx",
+ "configurePreset": "icx",
+ "configuration": "RelWithDebInfo"
+ },
{
"name": "matlab",
"configurePreset": "matlab",
diff --git a/Jenkinsfile b/Jenkinsfile
index 89951e3ba..2dc8247c5 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -9,30 +9,53 @@ pipeline {
stage('main') {
agent {
dockerfile {
- filename 'tools/cufinufft/docker/cuda11.0/Dockerfile-x86_64'
+ filename 'tools/cufinufft/docker/cuda11.2/Dockerfile-x86_64'
args '--gpus 2'
+ label 'v100'
}
}
environment {
- HOME = "$WORKSPACE/build"
+ HOME = "$WORKSPACE"
PYBIN = "/opt/python/cp38-cp38/bin"
+ LIBRARY_PATH = "$WORKSPACE/build"
+ LD_LIBRARY_PATH = "$WORKSPACE/build"
}
steps {
sh '''#!/bin/bash -ex
nvidia-smi
'''
sh '''#!/bin/bash -ex
- cp -r /io/build/test/cuda cuda_tests
- cd cuda_tests
+ echo $HOME
+ '''
+ sh '''#!/bin/bash -ex
+ # v100 cuda arch
+ cuda_arch="70"
+
+ cmake -B build . -DFINUFFT_USE_CUDA=ON \
+ -DFINUFFT_USE_CPU=OFF \
+ -DFINUFFT_BUILD_TESTS=ON \
+ -DCMAKE_CUDA_ARCHITECTURES="$cuda_arch" \
+ -DBUILD_TESTING=ON
+ cd build
+ make -j4
+ '''
+ sh '''#!/bin/bash -ex
+ cd build/test/cuda
ctest --output-on-failure
'''
sh '${PYBIN}/python3 -m venv $HOME'
sh '''#!/bin/bash -ex
source $HOME/bin/activate
python3 -m pip install --upgrade pip
- LIBRARY_PATH=/io/build python3 -m pip install -e python/cufinufft
+ python3 -m pip install --upgrade pycuda cupy-cuda112 numba
+ python3 -m pip install torch==1.7.1+cu110 -f https://download.pytorch.org/whl/torch_stable.html
+ python3 -m pip install -e python/cufinufft
python3 -m pip install pytest
- python3 -m pytest
+ python -c "from numba import cuda; cuda.cudadrv.libs.test()"
+ python3 -m pytest --framework=pycuda python/cufinufft
+ python3 -m pytest --framework=numba python/cufinufft
+ python3 -m pytest --framework=cupy python/cufinufft
+ python3 -m pytest --framework=torch python/cufinufft
'''
}
}
diff --git a/README.md b/README.md
index 042fbea10..88f34d84f 100644
--- a/README.md
+++ b/README.md
@@ -3,20 +3,20 @@
Principal author **Alex H. Barnett**,
main co-developers Jeremy F. Magland,
-Ludvig af Klinteberg, Yu-hsuan "Melody" Shih, Andrea Malleo, Libin Lu,
-and Joakim Andén;
+Ludvig af Klinteberg, Yu-hsuan "Melody" Shih, Libin Lu,
+Joakim Andén, and Robert Blackwell;
see `docs/ackn.rst` for full list of contributors.
-This is a lightweight CPU library to compute the three standard types of nonuniform FFT to a specified precision, in one, two, or three dimensions. It is written in C++ with interfaces to C, Fortran, MATLAB/octave, Python, and (in a separate [repository](https://github.com/ludvigak/FINUFFT.jl)) Julia. It now contains the GPU CUDA library cuFINUFFT.
+This is a lightweight CPU library to compute the three standard types of nonuniform FFT to a specified precision, in one, two, or three dimensions. It is written in C++ with interfaces to C, Fortran, MATLAB/octave, Python, and (in a separate [repository](https://github.com/ludvigak/FINUFFT.jl)) Julia. It now also integrates the GPU CUDA library cuFINUFFT (which currently does all but type 3).
Please see the [online documentation](http://finufft.readthedocs.io/en/latest/index.html) which can also be downloaded as a [PDF manual](https://finufft.readthedocs.io/_/downloads/en/latest/pdf/).
-You will also want to see example codes in the directories
-`examples`, `test`, `fortran`, `matlab/test`, and `python/test`.
-If you cannot compile, or `pip install`, try our rather outdated [precompiled binaries](http://users.flatironinstitute.org/~ahb/codes/finufft-binaries).
+You will also want to see CPU example codes in the directories `examples`, `test`, `fortran`, `matlab/test`, `matlab/examples`, `python/finufft/test`, etc, and GPU examples in `examples/cuda`, `test/cuda`, etc.
+
+If you cannot build via CMake, try the old makefile. Python users try `pip install finufft`. See the docs for details. See our GitHub Issues for tips.
If you prefer to read text files, the source to generate the above documentation is in human-readable (mostly .rst) files as follows:
@@ -29,22 +29,22 @@ If you prefer to read text files, the source to generate the above documentation
- `docs/c_gpu.rst` : documentation of C++/C function API for GPU library
- `docs/opts.rst` : optional parameters
- `docs/error.rst` : error codes
-- `docs/trouble.rst` : troubleshooting
+- `docs/trouble.rst` : troubleshooting advice
- `docs/tut.rst` and `docs/tutorial/*` : tutorial application examples
- `docs/fortran.rst` : usage examples from Fortran, documentation of interface
- `docs/matlab.rst` and `docs/matlabhelp.raw` : using the MATLAB/Octave interface
- `docs/python.rst` and `python/*/_interfaces.py` : using the Python interface
- `docs/python_gpu.rst` : Python interface to GPU library
-- `docs/julia.rst` : using the Julia interface
+- `docs/julia.rst` : options for Julia users
- `docs/devnotes.rst`: notes/guide for developers
- `docs/related.rst` : other recommended NUFFT packages
-- `docs/users.rst` : users of FINUFFT and dependent packages
+- `docs/users.rst` : some known users of FINUFFT, dependent packages
- `docs/ackn.rst` : authors and acknowledgments
-- `docs/refs.rst` : journal article references (ours and others)
+- `docs/refs.rst` : journal article references (both ours and others)
-If you find (cu)FINUFFT useful in your work, please cite this repository and
-the following. For FINUFFT (CPU library):
+If you find (cu)FINUFFT useful in your work, please star this repository and
+cite it and the following. For FINUFFT (CPU library):
A parallel non-uniform fast Fourier transform library based on an ``exponential of semicircle'' kernel.
A. H. Barnett, J. F. Magland, and L. af Klinteberg.
diff --git a/TODO b/devel/TODO
similarity index 100%
rename from TODO
rename to devel/TODO
diff --git a/devel/agenda_11-27-23.txt b/devel/agenda_11-27-23.txt
new file mode 100644
index 000000000..547e31406
--- /dev/null
+++ b/devel/agenda_11-27-23.txt
@@ -0,0 +1,138 @@
+Goals: make release with some ok and stable stuff in it
+
+
+Release (name = v2.3 or v2.2.1?) ------------------------------------------
+
+ - presumably a tag on GH, once done
+
+ - 2.2 = 2.2.0dev0 never tagged. Last release was 2.1 in June 2022.
+
+ - binaries available for users (requests eg #362 #285) -> place binaries for
+ 3 OSes somewhere on GH, eg "Releases" -> Assets. [ Can automate?]
+ Kill off my old binary webpage.
+
+
+CI ...........
+
+CI/jenkins/ branch -> failing suddenly 11/22 - why?
+
+CI status runs only CPU tests:
+ spreadtestall.sh (has no math checking, a small perf test)
+ basicpassfail{f} (exit code checks math)
+ check_finufft.sh (both prec; has math chk + dumbinputs chk)
+ CI doesn't run GPU!
+
+CI improvements:
+ run examples?
+ fortran / other langs ? (complicated in CI...)
+
+PR 254 - CI using cmake(ctest), builds on.
+ - can OMP_NUM_THREADS be set in platform-indep way in ctest add_test() ? no
+   DD says do in C++: https://github.com/flatironinstitute/finufft/pull/254
+ Much easier, but needs a new cmd arg in the test/*.cpp executables
+
+CI py had zero CPU time on windows -> div-by-0 error! (I fixed). why?
+
+
+library.....
+
+[ahb brought in PR 354 thread safety. Simplifies docs too.
+ - jax waiting on this
+ - ask Robert: unclear why some fftw_free or alloc are mutexed and not others?
+ - ahb tested w/ all make tasks.]
+
+GPU: PR 330: deal w/ cuda streams - difficult for ahb to test
+ - jax waiting on this
+ - multicoil MRI users have opinions
+
+remove obsolete bkw-compat (nufft_opts) ?
+
+
+wrappers....
+
+Libin to fix (1,M) vs (M,) Py issue #367
+
+[Libin did PR for #359, but not closed? I tested py and closed]
+
+pr 308 - no
+
+
+docs....
+
+RTD missing a Secret for webhook to tell GH; RTD won't give it :)
+I changed repo to FI/finufft from ahbarnett/finufft ... RTD webhook fails
+(code 400). Resync fails. Checked build (py 3.11) by hand is ok (1 min).
+ask Dylan.
+
+
+Notes for team......
+
+ We need to keep docs/* and CHANGELOG up to date when make changes!
+
+ who tests GPU code and adds more tests, CI?
+
+ Please close an issue after you fix it
+
+
+
+========= FUTURE DECISIONS =============
+
+* Build:
+ - keep makefile, yes, but slowly deprecate (eg Leslie/Alex rely on it) ?
+
+* CI:
+ complete rest of PR 254 for CI via cmake/ctest.
+
+* CPU:
+ * allow FFT switching (plus in cmake but not old makefile?)
+ - see Reinecke pr 287. Benchmarking needed for us, including batched FFTW.
+ - good: no fft plan needed.
+ - bad: installation complexity :( (makefile and CMake)
+
+* GPU:
+ * Type 3 in GPU.
+ * Paquiteau request for multi-coil MRI:
+ having multiples plans with shared memory (e.g. for the NUpts locations and others parameters) could be an alternative ?
+   https://github.com/flatironinstitute/finufft/pull/330
+ * GPU interfaces to py
+ * matlab par-toolbox gpuarray interfaces
+
+* explicit 1D slices for fftw, cuts 2d and 3d fft cost down (Reinecke idea).
+ - #304 has Robert's start in 2D. Also Reinecke has in #287.
+ - but: must work with batching too (batched 1d's with stride, ugh?).
+ - nontrivial, benchmark -> a couple of days solid work.
+
+* improved window shapes, more upsampfacs, like ducc0
+ - several days work
+
+* SIMD spread/interp/sort
+ - significant work, but big potential gains
+
+* Overall, consider joining XSDK via conforming to their good policies:
+https://github.com/xsdk-project/xsdk-community-policies
+Mandatory policies: we already conform to
+M2, M4 (need NERSC etc CMake presets...), M7, M9, M10, M11, M14, M17 (I think).
+I don't understand: M12.
+Not relevant: M3.
+Leaves to do:
+M1 - spack installation route
+M5 - reliable contact for dev team - easy
+M8 - runtime API to get version # - easy
+M13 - we don't really have a make install. But local locations correct.
+M16 - our DEBUG is runtime, not a build option. Need add -g in a cmake task.
+CI is not mentioned.
+
+* source code formatting?
+
+* see devel/TODO - and make this file current by thinking/killing issues there
+
+* Marco plan April through Aug 2024
+
+
+
+========= Random Questions ==========
+
+#569 ? onedim_fseries_kernel ... was not brought in
+
+Can I kill .travis.yml ? (3 yrs old - what's it for?)
+
diff --git a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
index 091b2a9d7..c80a53f08 100644
--- a/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
+++ b/devel/cuda/cufinufft_tasks_meeting_Jun2023.txt
@@ -52,9 +52,11 @@ SMALL TASKS IMMEDIATE:
* DOCS: (Alex will update RST docs for cuda, incl its PNG). DONE.
-INSTALL: add how to find SM_?? cuda arch [DONE]
-* test issue #276 re meth=1,2 - Alex will update
+* test issue #276 re meth=1,2 - Alex will update. DONE
+
+* cufinufft has C and C++ - J make issue [<- Alex forgot what this meant??]
+[was merely problem with headers in using gcc] - DONE.
-* cufinufft has C and C++ - J make issue <- Alex forgot what this meant??
* see if gpu error codes can be merged into cpu - the only new one
could be cudaMalloc failed.
diff --git a/devel/finufft_meeting-7-5-23.txt b/devel/finufft_meeting-7-5-23.txt
new file mode 100644
index 000000000..3e843d84c
--- /dev/null
+++ b/devel/finufft_meeting-7-5-23.txt
@@ -0,0 +1,73 @@
+
+Aiming for v2.2 release late-July 2023.
+(Save cufinufft t3 and FFT-sparsity exploits for v2.3, in Fall '23).
+
+
+TO DO:
+
+* cufinufft_default_opts needs to match finufft - remove dims and type
+Sets defaults: gpu_sort =1, gpu_method=0 (means auto-choose)
+
+gnu_binsize{x,y,z} = 0 amd change later default-setting to
+
+cufinufft_makeplan: don't change user opts struct,
+just change the deep copy sitting in the plan object.
+if gpu_meth = 0, auto-choose it (1,2,4).
+
+Rationalize naming of opts: (enums?)
+
+gpu_meth = 1 is GM (for type 1 or 2, and best for type 2 only)
+gpu_meth = 2 is SM for spread (best for type-1); gpu_sort then ignored (check?)
+gpu_meth = 4 is block-gather, for spread, only in 3d - experimental
+
+gpu_sort =0 : GM
+gpu_sort =1 : GM-sort (default)
+
+(Py internal _default_opts needs to be updated to match)
+
+
+* Joakim dtype Py deprecate with attribute error dtype=Float64
+ cufinufft.nufft simple interfaces (only for Py), add to docs, Joakim.
+
+
+* cufinufft tests:
+PR #303, merge.
+Maybe Alex test this.
+
+* gpu error codes merge them into cpu, add GPU ALLOC ERROR.
+Ie add GPU-specific err codes to l.42 include/finufft/defs.h
+Not have cufinufft give ier=1 randomly. Issue #284.
+Test for cuda kernel launch failures? we don't currently. Issue #273.
+
+* bring in as many small PRs as possible and close issues.
+
+
+
+LESS URGENT TO DO:
+
+* alex "answer" discussions.
+* Julia wrapper for gpu code - Ludvig?
+* matlab wrapper to gpu code - gpuArray.
+* py-project.TOML 3.12 obsoleting dist-utils. But can still use setup.py
+ [for 2024]
+* Exploit sparsity of 2d/3d FFT a la Reinecke suggestion, and Robert demo.
+ Esp. important for cuFFT GPU version (upsampfac=2, so nf>=2N in each dim)
+ For CPU code a little tricky to decide ntrans>1, multithreading.
+
+
+
+TO NOT DO:
+
+* PR 306 py expose spread/interp, leave for now
+
+
+DONE:
+
+folder struct reorg
+
+Reinecke perf improvements CPU: binsorting; interp in 2d/3d.
+
+1.3 legacy cuf repo.
+
+docs
+
diff --git a/devel/plans_fall23.txt b/devel/plans_fall23.txt
new file mode 100644
index 000000000..359d8cc8b
--- /dev/null
+++ b/devel/plans_fall23.txt
@@ -0,0 +1,27 @@
+FINUFFT (CPU+GPU) plans, Fall 2023:
+
+pick meeting date
+
+* CPU spreader/interp, bring in Wenda's stuff -> Marco?
+
+* GPU type 3 Marco?
+
+* implement Reinecke's sparse sliced FFT, w/ many-vectors too. CPU
+Libin help?
+
+* binaries release with GH Releases feature (Assets?). Not crucial.
+
+* CPU perf tests, standardized way to benchmark spread/interp, FFT, H<>D, etc.
+Add to docs/devnotes.rst how to run benchmarks.
+Robert + Joakim
+
+* tutorial in docs: on eg inverse FFT by CG iteration.
+
+* doc for py local install #340
+
+* #340 docs for GPU simple interface
+
+Go through Issues & prioritize.
+
+PRs.
+
diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
index 4c6dc7175..21a4e3170 100644
--- a/docs/c_gpu.rst
+++ b/docs/c_gpu.rst
@@ -295,10 +295,8 @@ To create such a structure, use:
.. code-block:: c
cufinufft_opts opts;
- ier = cufinufft_default_opts(type, dim, &opts);
+ cufinufft_default_opts(&opts);
-where ``type`` and ``dim`` are as above in ``cufinufft_makeplan``.
-As before, the returned value is zero if success, otherwise an error occurred.
Then you may change fields of ``opts`` by hand, finally pass ``&opts`` in as the last argument to ``cufinufft_makeplan`` or ``cufinufftf_makeplan``.
The options fields are currently only documented in the ``include/cufinufft_opts.h``.
diff --git a/docs/cex.rst b/docs/cex.rst
index 1f824bc8b..b1b9b2a1f 100644
--- a/docs/cex.rst
+++ b/docs/cex.rst
@@ -217,8 +217,8 @@ See ``examples/many1d1.cpp`` and ``test/finufft?dmany_test.cpp``
for more examples.
-Guru interface example
-----------------------
+Guru interface examples
+-----------------------
If you want more flexibility than the above, use the "guru" interface:
this is similar to that of FFTW3, and to the main interface of
@@ -268,42 +268,40 @@ is needed since the Fourier coefficient dimensions are passed as an array.
The complete code with a math test is in ``examples/guru2d1.cpp``, and for
more examples see ``examples/guru1d1*.c*``
+Using the guru interface to perform a vectorized transform (multiple 1D type 1
+transforms each with the same nonuniform points) is demonstrated in
+``examples/gurumany1d1.cpp``. This is similar to the single-command vectorized
+interface, but allowing more control (changing the nonuniform points without
+re-planning the FFT, etc).
-Thread safety and global state
-------------------------------
+
+Thread safety for single-threaded transforms, and global state
+--------------------------------------------------------------
It is possible to call FINUFFT from within multithreaded code, e.g. in an
-OpenMP parallel block. In this case ``opts.nthreads=1`` should be set,
-and FINUFFT must have been compiled with the ``-DFFTW_PLAN_SAFE`` flag,
-making it thread-safe.
-For demos of this, see
+OpenMP parallel block. In this case ``opts.nthreads=1`` should be set, otherwise
+a segfault will occur. This is useful if you don't want to synchronize
+independent transforms.
+For demos of this "parallelize over single-threaded transforms" use case, see
+the following, which are built as part of the ``make examples`` task:
* ``examples/threadsafe1d1`` which runs a 1D type-1 separately on each thread, checking the math, and
-* ``examples/threadsafe2d2f`` which runs a 2D type-2 on each slice, which are parallelized over via an OpenMP parallel for loop (without any math check, just dummy inputs)
-
-which are both built by ``make examples`` if the above flag has been set.
-
-Note: A design decision of FFTW is to have a global state which stores
-wisdom and settings. Such global state can cause unforeseen effects on other routines that also use FFTW. In contrast, FINUFFT uses pointers to plans to store
-its state, and does not have a global state (other than one ``static``
-flag used as a lock on FFTW initialization in the FINUFFT plan
-stage). This means different FINUFFT calls should not affect each other,
-although they may affect other codes that use FFTW via FFTW's global state.
-
-Alternatively it is possible to achieve thread safety without compiling with ``-DFFTW_PLAN_SAFE``. This can lead to higher performance.
-This way to achieve thread safety is to use the guru interface and split initialization and transformation into two steps. The initialization step is not thread safe, but the transformation step is. The initialization step can be parallelized over threads, but the transformation step cannot. See ``examples/threadsafe1d1`` for an example of this.
-In particular:
-
-.. code-block:: C++
+* ``examples/threadsafe2d2f`` which runs a 2D type-2 on each "slice" (in the MRI
+ language), parallelized over slices with an OpenMP parallel for loop.
+ (In this code there is no math check, just status check.)
- finufft_makeplan(...args...) // not thread safe
+However, if you have multiple transforms with the *same* nonuniform points for
+each transform, it is probably much faster to use the vectorized interface,
+and do all these transforms with a single such multithreaded FINUFFT call
+(see ``examples/many1d1.cpp`` and ``examples/gurumany1d1.cpp``).
+This may be less convenient if you want to leave your slices unsynchronized.
-All the other functions are thread safe. With OpenMP is possible to use:
-
-.. code-block:: C++
-
- #pragma omp critical
- finufft_makeplan(...args...) // not thread safe
-
-to achieve thread safety.
+.. note::
+ A design decision of FFTW is to have a global state which stores
+ wisdom and settings. Such global state can cause unforeseen effects on other
+ routines that also use FFTW. In contrast, FINUFFT uses pointers to plans to store
+ its state, and does not have a global state (other than one ``static``
+ flag used as a lock on FFTW initialization in the FINUFFT plan
+ stage). This means different FINUFFT calls should not affect each other,
+ although they may affect other codes that use FFTW via FFTW's global state.
diff --git a/docs/conf.py b/docs/conf.py
index 2148db4ff..f67d4b6a0 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,10 +21,11 @@
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, os.path.abspath('../python'))
+sys.path.insert(0, os.path.abspath('../python/finufft'))
+sys.path.insert(0, os.path.abspath('../python/cufinufft'))
# quash sphinx's inevitable failure of C++ lib extension to import...
-autodoc_mock_imports = ['finufft._finufft', 'numpy']
+autodoc_mock_imports = ['finufft._finufft', 'numpy', 'pycuda', 'cufinufft._cufinufft']
# The above is not enough for nested import -- forcibly mock them out ahead of time:
#for name in autodoc_mock_imports:
# sys.modules[name] = sphinx.ext.autodoc._MockModule(name, None)
@@ -73,9 +74,9 @@
# built documents.
#
# The short X.Y version.
-version = u'2.1'
+version = u'2.2'
# The full version, including alpha/beta/rc tags.
-release = u'2.1.0'
+release = u'2.2.0.dev0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
diff --git a/docs/devnotes.rst b/docs/devnotes.rst
index a8229e549..96fb181a8 100644
--- a/docs/devnotes.rst
+++ b/docs/devnotes.rst
@@ -9,8 +9,8 @@ Developer notes
- ``CMakeLists.txt`` for cmake
- ``docs/conf.py`` for sphinx
- - ``python/setup.py`` for the python pkg version
- - ``python/finufft/__init__.py``
+ - ``python/finufft/setup.py`` for the python pkg version
+ - ``python/cufinufft/setup.py`` for the GPU python pkg version
- ``include/finufft/defs.h`` for the debug>0 output
- ``matlab/Contents.m`` for the MATLAB/Octave help
- ``CHANGELOG``: don't forget to describe the new features and changes, folding lines at 80 chars.
@@ -21,14 +21,14 @@ Developer notes
* For testing and performance measuring routines see ``test/README`` and ``perftest/README``. We need more of the latter, eg, something making performance graphs that enable rapid eyeball comparison of various settings/machines.
-* Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test build variants OMP=OFF nor FFTW_PLAN_SAFE options. The dev should test these locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``JenkinsFile`` for the GPU CI via python wrappers.
+* Continuous Integration (CI). See files for this in ``.github/workflows/``. It currently tests the default ``makefile`` settings in linux, and three other ``make.inc.*`` files covering OSX and Windows (MinGW). CI does not test the build variant OMP=OFF. The dev should test these locally. Likewise, the Julia wrapper is separate and thus not tested in CI. We have added ``JenkinsFile`` for the GPU CI via python wrappers.
* **Installing MWrap**. This is needed only for experts to rebuild the matlab/octave interfaces.
`MWrap <https://github.com/zgimbutas/mwrap>`_
is a very useful MEX interface generator by Dave Bindel, now maintained
and expanded by Zydrunas Gimbutas.
Make sure you have ``flex`` and ``bison`` installed to build it.
- As of FINUFFT v.2.0 you will need a recent (>=0.33.10) version of MWrap.
+ As of FINUFFT v2.0 you will need a recent (>=0.33.10) version of MWrap.
Make sure to override the location of MWrap by adding a line such as::
MWRAP = your-path-to-mwrap-executable
diff --git a/docs/error.rst b/docs/error.rst
index 68df90c62..e40d8e256 100644
--- a/docs/error.rst
+++ b/docs/error.rst
@@ -5,7 +5,7 @@ Error (status) codes
In all FINUFFT interfaces, the returned value ``ier`` is a status indicator.
It is ``0`` if successful, otherwise the error code
-has the following meanings (see ``include/defs.h``):
+has the following meanings (see ``include/finufft_errors.h``):
::
@@ -22,7 +22,13 @@ has the following meanings (see ``include/defs.h``):
11 general allocation failure
12 dimension invalid
13 spread_thread option invalid
-
+ 14 invalid mode array (more than ~2^31 modes, dimension with 0 modes, etc)
+ 15 cuda failure (failure to call any cuda function/kernel)
+ 16 attempt to destroy an uninitialized plan
+ 17 invalid spread/interp method for dim (attempt to blockgather in 1D, e.g.)
+ 18 size of bins for subprob/blockgather invalid
+ 19 GPU shmem too small for subprob/blockgather parameters
+
When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.
For any other nonzero values of ``ier`` the transform may not have been performed and the output should not be trusted. However, we hope that the value of ``ier`` will help to narrow down the problem.
diff --git a/docs/install.rst b/docs/install.rst
index 6d379580e..3cefefbfa 100644
--- a/docs/install.rst
+++ b/docs/install.rst
@@ -26,7 +26,7 @@ Then add the following to your ``CMakeLists.txt``:
CPMAddPackage(
NAME Finufft
GIT_REPOSITORY https://github.com/flatironinstitute/finufft.git
- GIT_TAG v2.1.0
+ GIT_TAG master
GIT_SHALLOW Yes
GIT_PROGRESS Yes
EXCLUDE_FROM_ALL Yes
@@ -81,6 +81,10 @@ For example, to configure, build and test the development preset (which builds t
From other CMake projects, to use ``finufft`` as a library, simply add this repository as a subdirectory using
``add_subdirectory``, and use ``target_link_library(your_executable finufft)``.
+.. note::
+
+ When compiling with CMake on linux at the Flatiron Institute (Rusty cluster): we have had a report that if you want to use LLVM, you need to ``module load llvm/16.0.3``, otherwise the default ``llvm/14.0.6`` does not find ``OpenMP_CXX``.
+
Old GNU make based route
------------------------
@@ -260,8 +264,6 @@ This is for experts.
Here are all the flags that the FINUFFT source responds to.
Activate them by adding a line of the form ``CXXFLAGS+=-DMYFLAG`` in your ``make.inc``:
-* ``-DFFTW_PLAN_SAFE``: This makes FINUFFT call ``fftw_make_planner_thread_safe()`` as part of its FFTW3 planner stage; see http://www.fftw.org/fftw3_doc/Thread-safety.html. This makes FINUFFT thread-safe. See ``examples/threadsafe1d1.cpp`` and ``examples/threadsafe2d2f.cpp``. This is only available in FFTW version >=3.3.6; for this reason it is not yet the default. If you get segfault on these examples, try ``FFTWOMPSUFFIX = threads`` as explained below.
-
* ``-DSINGLE``: This is internally used by our build process to switch
(via preprocessor macros) the source from double to single precision.
You should not need to use this flag yourself.
@@ -356,14 +358,20 @@ The GCC route
~~~~~~~~~~~~~~
This is less recommended, unless you need to link from ``gfortran``, when it
-appears to be essential. We have tested on Movaje::
+appears to be essential. The basic idea is::
- cp make.inc.macosx_gcc-8 make.inc
+ cp make.inc.macosx_gcc-12 make.inc
make test -j
make fortran
which also compiles and tests the fortran interfaces.
-In Catalina you'll probably need to edit to ``g++-10`` in your ``make.inc``.
+You may need to edit this to ``g++-11``, or whatever your GCC version is,
+in your ``make.inc``.
+
+.. note::
+
+ A problem between GCC and the new XCode 15 requires a workaround to add ``LDFLAGS+=-ld64`` to force the old linker to be used. See the above file ``make.inc.macosx_gcc-12``.
+
We find python may be built as :ref:`below`.
We found that octave interfaces do not work with GCC; please help.
For MATLAB, the MEX settings may need to be
diff --git a/docs/python_gpu.rst b/docs/python_gpu.rst
index 6ebbcfc59..140c8b557 100644
--- a/docs/python_gpu.rst
+++ b/docs/python_gpu.rst
@@ -66,3 +66,4 @@ Full documentation
.. automodule:: cufinufft
:members:
:member-order: bysource
+ :undoc-members:
diff --git a/docs/trouble.rst b/docs/trouble.rst
index 4d9711600..2b7b8114a 100644
--- a/docs/trouble.rst
+++ b/docs/trouble.rst
@@ -70,7 +70,7 @@ Crash (segfault) issues and advice
- Maybe you have switched off nonuniform point bounds checking (``opts.chkbnds=0``) for a little extra speed? Try switching it on again to catch illegal coordinates.
-- Thread-safety: are you calling FINUFFT from inside a multithreaded block of code, without compiling with ``-DFFTW_PLAN_SAFE`` or setting ``opts.nthreads=1``? If ``gdb`` indicates crashes during FFTW calls, this is another sign.
+- Thread-safety: are you calling FINUFFT from inside a multithreaded block of code without setting ``opts.nthreads=1``? If ``gdb`` indicates crashes during FFTW calls, this is another sign.
- To isolate where a crash is occurring, set ``opts.debug`` to 1 or 2, and check the text output of the various stages. With a debug setting of 2 or above, when ``ntrans>1`` a large amount of text can be generated.
diff --git a/docs/tut.rst b/docs/tut.rst
index 19476052c..7dcc07b87 100644
--- a/docs/tut.rst
+++ b/docs/tut.rst
@@ -27,5 +27,5 @@ For further applications, see :ref:`references `, and:
* The numerical sampling of `random plane waves `_.
-* These seminar `PDF slides `_.
+* These seminar `PDF slides `_.
diff --git a/docs/users.rst b/docs/users.rst
index bbcd80ee7..e689a9e94 100644
--- a/docs/users.rst
+++ b/docs/users.rst
@@ -76,7 +76,10 @@ For the latest see: Google Scholar `FINUFFT citations
#include
#include
+#include
// only good for small projects...
using namespace std;
@@ -18,10 +19,10 @@ int main(int argc, char* argv[])
pointers to STL vectors of C++ double complex numbers, with a math check.
Barnett 2/27/20
- Compile on linux with:
- g++-7 -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm
+ Compile on linux with (or see ../makefile):
+ g++ -std=c++14 -fopenmp guru1d1.cpp -I../include ../lib-static/libfinufft.a -o guru1d1 -lfftw3 -lfftw3_omp -lm
- Or if you have built a single-core library, remove -fopenmp and -lfftw3_omp
+ Or if you have built a single-thread library, remove -fopenmp and -lfftw3_omp
Usage: ./guru1d1
*/
@@ -49,7 +50,7 @@ int main(int argc, char* argv[])
for (int j=0; j> c(M);
@@ -58,17 +59,18 @@ int main(int argc, char* argv[])
// alloc output array for the Fourier modes, then do the transform
vector> F(N);
- int ier = finufft_execute(plan, &c[0], &F[0]);
+ int ier = finufft_execute(plan, c.data(), F.data());
// for fun, do another with same NU pts (no re-sorting), but new strengths...
for (int j=0; j=-(double)N/2 && n<(double)N/2); // ensure meaningful test
complex Ftest = complex(0,0);
for (int j=0; j
+#include
+
+// specific to this demo...
+#include
+#include
+#include
+#include
+#include
+
+// only good for small projects...
+using namespace std;
+// allows 1i to be the imaginary unit... (C++14 onwards)
+using namespace std::complex_literals;
+
+int main(int argc, char* argv[])
+{
+ int M = 2e5; // number of nonuniform points
+ int N = 1e5; // number of modes
+ double tol = 1e-9; // desired accuracy
+ int ntrans = 100; // request a bunch of transforms in the execute
+ int isign = +1; // sign of i in the transform math definition
+
+ int type = 1, dim = 1; // 1d1
+ int64_t Ns[3] = {N,0,0}; // guru describes mode array by vector [N1,N2..]
+ finufft_plan plan; // creates a plan struct (NULL below: default opts)
+ finufft_makeplan(type, dim, Ns, isign, ntrans, tol, &plan, NULL);
+
+ // generate random nonuniform points and pass to FINUFFT
+ vector x(M);
+ for (int j=0; j> c(M*ntrans); // plain contiguous storage
+ for (int j=0; j> F(N*ntrans);
+ printf("guru many 1D type-1 double-prec, tol=%.3g, executing %d transforms (vectorized), each size %d NU pts to %d modes...\n",tol,ntrans,M,N);
+ int ier = finufft_execute(plan, c.data(), F.data());
+
+ // could now change c, do another execute, do another setpts, execute, etc...
+
+ finufft_destroy(plan); // don't forget! we're done with transforms of this size
+
+ // rest is math checking and reporting...
+ int k = 42519; // check the answer just for this mode
+ int trans = 71; // ...testing in just this transform
+ assert(k>=-(double)N/2 && k<(double)N/2); // ensure meaningful test
+ assert(trans>=0 && trans Ftest = complex(0,0);
+ for (int j=0; jFmax) Fmax=aF;
+ }
+ int nout = k+N/2 + N*trans; // output index for freq mode k in the trans
+ double err = abs(F[nout] - Ftest)/Fmax;
+ printf("\tdone: ier=%d; for transform %d, rel err in F[%d] is %.3g\n",ier,trans,k,err);
+
+ return ier;
+}
diff --git a/examples/threadsafe1d1.cpp b/examples/threadsafe1d1.cpp
index 5656df7d5..f25f25b8b 100644
--- a/examples/threadsafe1d1.cpp
+++ b/examples/threadsafe1d1.cpp
@@ -13,13 +13,12 @@ using namespace std;
int main(int argc, char* argv[])
/* Demo single-threaded FINUFFT calls from inside a OMP parallel block.
Adapted from simple1d1.cpp: C++, STL double complex vectors, with math test.
- Barnett 4/19/21, eg for Goran Zauhar, issue #183.
+ Barnett 4/19/21, eg for Goran Zauhar, issue #183. Also see: many1d1.cpp.
- Notes: libfinufft *must* have been built with -DFFTW_PLAN_SAFE, which needs
- FFTW >= 3.3.6. You also may not have libfftw3_omp, so I have switched to
+ Notes: You may not have libfftw3_omp, so I have switched to
libfftw3_threads in this suggested compile command:
- g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1 -lfftw3 -lfftw3_threads -lm
+ g++ -fopenmp threadsafe1d1.cpp -I../include ../lib/libfinufft.so -o threadsafe1d1
Usage: ./threadsafe1d1
@@ -34,21 +33,22 @@ int main(int argc, char* argv[])
finufft_default_opts(opts);
complex I = complex(0.0,1.0); // the imaginary unit
- // generate some random nonuniform points (x) and complex strengths (c)...
- vector x(M);
- for (int j=0; jnthreads=1; // this is *crucial* so that each call single-thread
+ opts->nthreads=1; // *crucial* so that each call single-thread (otherwise segfaults)
// Now have each thread do independent 1D type 1 on their own data:
#pragma omp parallel
{
- // generate some complex strengths (c)... local to the thread
+ // generate some random nonuniform points (x) and complex strengths (c)...
+ // Note that these are local to the thread (if you have the *same* sets of
+ // NU pts x for each thread, consider instead using one vectorized multithreaded
+ // transform, which would be faster).
+ vector x(M);
vector > c(M);
- for (int j=0; j > F(N);
@@ -70,5 +70,6 @@ int main(int argc, char* argv[])
printf("[thread %2d] 1D t-1 dbl-prec NUFFT done. ier=%d, rel err in F[%d]: %.3g\n",omp_get_thread_num(),ier,k,err);
}
+
return 0;
}
diff --git a/examples/threadsafe2d2f.cpp b/examples/threadsafe2d2f.cpp
index c7ccfde67..9844af54a 100644
--- a/examples/threadsafe2d2f.cpp
+++ b/examples/threadsafe2d2f.cpp
@@ -1,15 +1,22 @@
/* This is a 2D type-2 demo calling single-threaded FINUFFT inside an OpenMP
- loop, to show thread-safety. It is closely based on a test code of Penfe,
+ loop, to show thread-safety with independent transforms, one per thread.
+ It is based on a test code of Penfe,
submitted github Issue #72. Unlike threadsafe1d1, it does not test the math;
- it is the shell of an application from MRI reconstruction.
+ it is the shell of an application from multi-coil/slice MRI reconstruction.
+ Note that since the NU pts are the same in each slice, in fact a vectorized
+ multithreaded transform could do all these slices together, and faster.
- FINUFFT lib must be built with (non-default) flag -DFFTW_PLAN_SAFE, and
- FFTW version >= 3.3.6. See http://www.fftw.org/fftw3_doc/Thread-safety.html
+ To compile (note uses threads rather than omp version of FFTW3):
- Then to compile (note uses threads rather than omp version of FFTW3):
+ g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -g -Wall
- g++ -fopenmp threadsafe2d2f.cpp -I../include ../lib/libfinufft.so -o threadsafe2d2f -lfftw3 -lfftw3_threads -lm -g -Wall
+ ./threadsafe2d2f <-- use all threads
+ OMP_NUM_THREADS=1 ./threadsafe2d2f <-- sequential, 1 thread
+ Expected output is 50 lines, each showing exit code 0. It's ok if they're
+ mangled due to threads writing to stdout simultaneously.
+
+ Barnett, tidied 11/22/23
*/
// this is all you must include for the finufft lib...
@@ -22,34 +29,37 @@
#include
using namespace std;
-void test_finufft(finufft_opts* opts)
-// self-contained small test that single-prec FINUFFT2D2 no error nor crash
+int test_finufft(finufft_opts* opts)
+ // self-contained small test that one single-prec FINUFFT2D2 has no error/crash
{
- size_t n_rows = 8, n_cols = 8;
- size_t n_read = 16, n_spokes = 4;
- std::vector x(n_read * n_spokes); // bunch of zero data
- std::vector y(n_read * n_spokes);
- std::vector> img(n_rows * n_cols);
- std::vector> ksp(n_read * n_spokes);
+ size_t n_rows = 256, n_cols = 256; // 2d image size
+ size_t n_read = 512, n_spokes = 128; // some k-space point params
+ size_t M = n_read*n_spokes; // how many k-space pts; MRI-specific
+ std::vector x(M); // bunch of zero input data
+ std::vector y(M);
+ std::vector> img(n_rows * n_cols); // coeffs
+ std::vector> ksp(M); // output array (vals @ k-space pts)
- int ier = finufftf2d2(n_read * n_spokes, x.data(), y.data(), ksp.data(),
- -1, 1e-3, n_rows, n_cols, img.data(), opts);
+ int ier = finufftf2d2(M, x.data(), y.data(), ksp.data(),
+ -1, 1e-3, n_rows, n_cols, img.data(), opts);
- std::cout << "\ttest_finufft: " << ier << ", thread " << omp_get_thread_num() << std::endl;
+ std::cout << "\ttest_finufft: exit code " << ier << ", thread " << omp_get_thread_num() << std::endl;
+ return ier;
}
int main(int argc, char* argv[])
{
finufft_opts opts;
finufftf_default_opts(&opts);
- opts.nthreads = 1; // this is *crucial* so that each call single-thread
-
- int n_slices = 50; // parallelize over slices
-#pragma omp parallel for num_threads(8)
- for (int i = 0; i < n_slices; i++)
- {
- test_finufft(&opts);
- }
+ opts.nthreads = 1; // *crucial* so each call single-thread; else segfaults
+
+ int n_slices = 50; // number of transforms. parallelize over slices
+ int overallstatus=0;
+#pragma omp parallel for
+ for (int i = 0; i < n_slices; i++) {
+ int ier = test_finufft(&opts);
+ if (ier!=0) overallstatus=1;
+ }
- return 0;
+ return overallstatus;
}
diff --git a/include/cufinufft.h b/include/cufinufft.h
index 106ea0d40..17ed6202b 100644
--- a/include/cufinufft.h
+++ b/include/cufinufft.h
@@ -1,8 +1,8 @@
// Defines the C++/C user interface to CUFINUFFT library.
#include
-#include
#include
+#include
typedef struct cufinufft_plan_s *cufinufft_plan;
typedef struct cufinufft_fplan_s *cufinufftf_plan;
@@ -10,12 +10,12 @@ typedef struct cufinufft_fplan_s *cufinufftf_plan;
#ifdef __cplusplus
extern "C" {
#endif
-int cufinufft_default_opts(int type, int dim, cufinufft_opts *opts);
+void cufinufft_default_opts(cufinufft_opts *opts);
-int cufinufft_makeplan(int type, int dim, int64_t *n_modes, int iflag, int ntr, double eps, cufinufft_plan *d_plan_ptr,
- cufinufft_opts *opts);
-int cufinufftf_makeplan(int type, int dim, int64_t *n_modes, int iflag, int ntr, float eps, cufinufftf_plan *d_plan_ptr,
- cufinufft_opts *opts);
+int cufinufft_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, double eps,
+ cufinufft_plan *d_plan_ptr, cufinufft_opts *opts);
+int cufinufftf_makeplan(int type, int dim, const int64_t *n_modes, int iflag, int ntr, float eps,
+ cufinufftf_plan *d_plan_ptr, cufinufft_opts *opts);
int cufinufft_setpts(cufinufft_plan d_plan, int M, double *h_kx, double *h_ky, double *h_kz, int N, double *h_s,
double *h_t, double *h_u);
diff --git a/include/cufinufft/contrib/cuda_samples/helper_cuda.h b/include/cufinufft/contrib/cuda_samples/helper_cuda.h
deleted file mode 100644
index 5378bfc11..000000000
--- a/include/cufinufft/contrib/cuda_samples/helper_cuda.h
+++ /dev/null
@@ -1,906 +0,0 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-////////////////////////////////////////////////////////////////////////////////
-// These are CUDA Helper functions for initialization and error checking
-
-#ifndef COMMON_HELPER_CUDA_H_
-#define COMMON_HELPER_CUDA_H_
-
-#pragma once
-
-#include
-#include
-#include
-#include
-
-#include
-
-#ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
-#endif
-
-// Note, it is required that your SDK sample to include the proper header
-// files, please refer the CUDA examples for examples of the needed CUDA
-// headers, which may change depending on which CUDA functions are used.
-
-// CUDA Runtime error messages
-#ifdef __DRIVER_TYPES_H__
-static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); }
-#endif
-
-#ifdef CUDA_DRIVER_API
-// CUDA Driver API errors
-static const char *_cudaGetErrorEnum(CUresult error) {
- static char unknown[] = "";
- const char *ret = NULL;
- cuGetErrorName(error, &ret);
- return ret ? ret : unknown;
-}
-#endif
-
-#ifdef CUBLAS_API_H_
-// cuBLAS API errors
-static const char *_cudaGetErrorEnum(cublasStatus_t error) {
- switch (error) {
- case CUBLAS_STATUS_SUCCESS:
- return "CUBLAS_STATUS_SUCCESS";
-
- case CUBLAS_STATUS_NOT_INITIALIZED:
- return "CUBLAS_STATUS_NOT_INITIALIZED";
-
- case CUBLAS_STATUS_ALLOC_FAILED:
- return "CUBLAS_STATUS_ALLOC_FAILED";
-
- case CUBLAS_STATUS_INVALID_VALUE:
- return "CUBLAS_STATUS_INVALID_VALUE";
-
- case CUBLAS_STATUS_ARCH_MISMATCH:
- return "CUBLAS_STATUS_ARCH_MISMATCH";
-
- case CUBLAS_STATUS_MAPPING_ERROR:
- return "CUBLAS_STATUS_MAPPING_ERROR";
-
- case CUBLAS_STATUS_EXECUTION_FAILED:
- return "CUBLAS_STATUS_EXECUTION_FAILED";
-
- case CUBLAS_STATUS_INTERNAL_ERROR:
- return "CUBLAS_STATUS_INTERNAL_ERROR";
-
- case CUBLAS_STATUS_NOT_SUPPORTED:
- return "CUBLAS_STATUS_NOT_SUPPORTED";
-
- case CUBLAS_STATUS_LICENSE_ERROR:
- return "CUBLAS_STATUS_LICENSE_ERROR";
- }
-
- return "";
-}
-#endif
-
-#ifdef _CUFFT_H_
-// cuFFT API errors
-static const char *_cudaGetErrorEnum(cufftResult error) {
- switch (error) {
- case CUFFT_SUCCESS:
- return "CUFFT_SUCCESS";
-
- case CUFFT_INVALID_PLAN:
- return "CUFFT_INVALID_PLAN";
-
- case CUFFT_ALLOC_FAILED:
- return "CUFFT_ALLOC_FAILED";
-
- case CUFFT_INVALID_TYPE:
- return "CUFFT_INVALID_TYPE";
-
- case CUFFT_INVALID_VALUE:
- return "CUFFT_INVALID_VALUE";
-
- case CUFFT_INTERNAL_ERROR:
- return "CUFFT_INTERNAL_ERROR";
-
- case CUFFT_EXEC_FAILED:
- return "CUFFT_EXEC_FAILED";
-
- case CUFFT_SETUP_FAILED:
- return "CUFFT_SETUP_FAILED";
-
- case CUFFT_INVALID_SIZE:
- return "CUFFT_INVALID_SIZE";
-
- case CUFFT_UNALIGNED_DATA:
- return "CUFFT_UNALIGNED_DATA";
-
- case CUFFT_INCOMPLETE_PARAMETER_LIST:
- return "CUFFT_INCOMPLETE_PARAMETER_LIST";
-
- case CUFFT_INVALID_DEVICE:
- return "CUFFT_INVALID_DEVICE";
-
- case CUFFT_PARSE_ERROR:
- return "CUFFT_PARSE_ERROR";
-
- case CUFFT_NO_WORKSPACE:
- return "CUFFT_NO_WORKSPACE";
-
- case CUFFT_NOT_IMPLEMENTED:
- return "CUFFT_NOT_IMPLEMENTED";
-
- case CUFFT_LICENSE_ERROR:
- return "CUFFT_LICENSE_ERROR";
-
- case CUFFT_NOT_SUPPORTED:
- return "CUFFT_NOT_SUPPORTED";
- }
-
- return "";
-}
-#endif
-
-#ifdef CUSPARSEAPI
-// cuSPARSE API errors
-static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
- switch (error) {
- case CUSPARSE_STATUS_SUCCESS:
- return "CUSPARSE_STATUS_SUCCESS";
-
- case CUSPARSE_STATUS_NOT_INITIALIZED:
- return "CUSPARSE_STATUS_NOT_INITIALIZED";
-
- case CUSPARSE_STATUS_ALLOC_FAILED:
- return "CUSPARSE_STATUS_ALLOC_FAILED";
-
- case CUSPARSE_STATUS_INVALID_VALUE:
- return "CUSPARSE_STATUS_INVALID_VALUE";
-
- case CUSPARSE_STATUS_ARCH_MISMATCH:
- return "CUSPARSE_STATUS_ARCH_MISMATCH";
-
- case CUSPARSE_STATUS_MAPPING_ERROR:
- return "CUSPARSE_STATUS_MAPPING_ERROR";
-
- case CUSPARSE_STATUS_EXECUTION_FAILED:
- return "CUSPARSE_STATUS_EXECUTION_FAILED";
-
- case CUSPARSE_STATUS_INTERNAL_ERROR:
- return "CUSPARSE_STATUS_INTERNAL_ERROR";
-
- case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
- return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
- }
-
- return "";
-}
-#endif
-
-#ifdef CUSOLVER_COMMON_H_
-// cuSOLVER API errors
-static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
- switch (error) {
- case CUSOLVER_STATUS_SUCCESS:
- return "CUSOLVER_STATUS_SUCCESS";
- case CUSOLVER_STATUS_NOT_INITIALIZED:
- return "CUSOLVER_STATUS_NOT_INITIALIZED";
- case CUSOLVER_STATUS_ALLOC_FAILED:
- return "CUSOLVER_STATUS_ALLOC_FAILED";
- case CUSOLVER_STATUS_INVALID_VALUE:
- return "CUSOLVER_STATUS_INVALID_VALUE";
- case CUSOLVER_STATUS_ARCH_MISMATCH:
- return "CUSOLVER_STATUS_ARCH_MISMATCH";
- case CUSOLVER_STATUS_MAPPING_ERROR:
- return "CUSOLVER_STATUS_MAPPING_ERROR";
- case CUSOLVER_STATUS_EXECUTION_FAILED:
- return "CUSOLVER_STATUS_EXECUTION_FAILED";
- case CUSOLVER_STATUS_INTERNAL_ERROR:
- return "CUSOLVER_STATUS_INTERNAL_ERROR";
- case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
- return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
- case CUSOLVER_STATUS_NOT_SUPPORTED:
- return "CUSOLVER_STATUS_NOT_SUPPORTED ";
- case CUSOLVER_STATUS_ZERO_PIVOT:
- return "CUSOLVER_STATUS_ZERO_PIVOT";
- case CUSOLVER_STATUS_INVALID_LICENSE:
- return "CUSOLVER_STATUS_INVALID_LICENSE";
- }
-
- return "";
-}
-#endif
-
-#ifdef CURAND_H_
-// cuRAND API errors
-static const char *_cudaGetErrorEnum(curandStatus_t error) {
- switch (error) {
- case CURAND_STATUS_SUCCESS:
- return "CURAND_STATUS_SUCCESS";
-
- case CURAND_STATUS_VERSION_MISMATCH:
- return "CURAND_STATUS_VERSION_MISMATCH";
-
- case CURAND_STATUS_NOT_INITIALIZED:
- return "CURAND_STATUS_NOT_INITIALIZED";
-
- case CURAND_STATUS_ALLOCATION_FAILED:
- return "CURAND_STATUS_ALLOCATION_FAILED";
-
- case CURAND_STATUS_TYPE_ERROR:
- return "CURAND_STATUS_TYPE_ERROR";
-
- case CURAND_STATUS_OUT_OF_RANGE:
- return "CURAND_STATUS_OUT_OF_RANGE";
-
- case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
- return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
-
- case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
- return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
-
- case CURAND_STATUS_LAUNCH_FAILURE:
- return "CURAND_STATUS_LAUNCH_FAILURE";
-
- case CURAND_STATUS_PREEXISTING_FAILURE:
- return "CURAND_STATUS_PREEXISTING_FAILURE";
-
- case CURAND_STATUS_INITIALIZATION_FAILED:
- return "CURAND_STATUS_INITIALIZATION_FAILED";
-
- case CURAND_STATUS_ARCH_MISMATCH:
- return "CURAND_STATUS_ARCH_MISMATCH";
-
- case CURAND_STATUS_INTERNAL_ERROR:
- return "CURAND_STATUS_INTERNAL_ERROR";
- }
-
- return "";
-}
-#endif
-
-#ifdef NVJPEGAPI
-// nvJPEG API errors
-static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
- switch (error) {
- case NVJPEG_STATUS_SUCCESS:
- return "NVJPEG_STATUS_SUCCESS";
-
- case NVJPEG_STATUS_NOT_INITIALIZED:
- return "NVJPEG_STATUS_NOT_INITIALIZED";
-
- case NVJPEG_STATUS_INVALID_PARAMETER:
- return "NVJPEG_STATUS_INVALID_PARAMETER";
-
- case NVJPEG_STATUS_BAD_JPEG:
- return "NVJPEG_STATUS_BAD_JPEG";
-
- case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
- return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
-
- case NVJPEG_STATUS_ALLOCATOR_FAILURE:
- return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
-
- case NVJPEG_STATUS_EXECUTION_FAILED:
- return "NVJPEG_STATUS_EXECUTION_FAILED";
-
- case NVJPEG_STATUS_ARCH_MISMATCH:
- return "NVJPEG_STATUS_ARCH_MISMATCH";
-
- case NVJPEG_STATUS_INTERNAL_ERROR:
- return "NVJPEG_STATUS_INTERNAL_ERROR";
- }
-
- return "";
-}
-#endif
-
-#ifdef NV_NPPIDEFS_H
-// NPP API errors
-static const char *_cudaGetErrorEnum(NppStatus error) {
- switch (error) {
- case NPP_NOT_SUPPORTED_MODE_ERROR:
- return "NPP_NOT_SUPPORTED_MODE_ERROR";
-
- case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
- return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
-
- case NPP_RESIZE_NO_OPERATION_ERROR:
- return "NPP_RESIZE_NO_OPERATION_ERROR";
-
- case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
- return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
-
- case NPP_BAD_ARG_ERROR:
- return "NPP_BAD_ARGUMENT_ERROR";
-
- case NPP_COEFF_ERROR:
- return "NPP_COEFFICIENT_ERROR";
-
- case NPP_RECT_ERROR:
- return "NPP_RECTANGLE_ERROR";
-
- case NPP_QUAD_ERROR:
- return "NPP_QUADRANGLE_ERROR";
-
- case NPP_MEM_ALLOC_ERR:
- return "NPP_MEMORY_ALLOCATION_ERROR";
-
- case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
- return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
-
- case NPP_INVALID_INPUT:
- return "NPP_INVALID_INPUT";
-
- case NPP_POINTER_ERROR:
- return "NPP_POINTER_ERROR";
-
- case NPP_WARNING:
- return "NPP_WARNING";
-
- case NPP_ODD_ROI_WARNING:
- return "NPP_ODD_ROI_WARNING";
-#else
-
- // These are for CUDA 5.5 or higher
- case NPP_BAD_ARGUMENT_ERROR:
- return "NPP_BAD_ARGUMENT_ERROR";
-
- case NPP_COEFFICIENT_ERROR:
- return "NPP_COEFFICIENT_ERROR";
-
- case NPP_RECTANGLE_ERROR:
- return "NPP_RECTANGLE_ERROR";
-
- case NPP_QUADRANGLE_ERROR:
- return "NPP_QUADRANGLE_ERROR";
-
- case NPP_MEMORY_ALLOCATION_ERR:
- return "NPP_MEMORY_ALLOCATION_ERROR";
-
- case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
- return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
-
- case NPP_INVALID_HOST_POINTER_ERROR:
- return "NPP_INVALID_HOST_POINTER_ERROR";
-
- case NPP_INVALID_DEVICE_POINTER_ERROR:
- return "NPP_INVALID_DEVICE_POINTER_ERROR";
-#endif
-
- case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
- return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
-
- case NPP_TEXTURE_BIND_ERROR:
- return "NPP_TEXTURE_BIND_ERROR";
-
- case NPP_WRONG_INTERSECTION_ROI_ERROR:
- return "NPP_WRONG_INTERSECTION_ROI_ERROR";
-
- case NPP_NOT_EVEN_STEP_ERROR:
- return "NPP_NOT_EVEN_STEP_ERROR";
-
- case NPP_INTERPOLATION_ERROR:
- return "NPP_INTERPOLATION_ERROR";
-
- case NPP_RESIZE_FACTOR_ERROR:
- return "NPP_RESIZE_FACTOR_ERROR";
-
- case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
- return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
-
- case NPP_MEMFREE_ERR:
- return "NPP_MEMFREE_ERR";
-
- case NPP_MEMSET_ERR:
- return "NPP_MEMSET_ERR";
-
- case NPP_MEMCPY_ERR:
- return "NPP_MEMCPY_ERROR";
-
- case NPP_MIRROR_FLIP_ERR:
- return "NPP_MIRROR_FLIP_ERR";
-#else
-
- case NPP_MEMFREE_ERROR:
- return "NPP_MEMFREE_ERROR";
-
- case NPP_MEMSET_ERROR:
- return "NPP_MEMSET_ERROR";
-
- case NPP_MEMCPY_ERROR:
- return "NPP_MEMCPY_ERROR";
-
- case NPP_MIRROR_FLIP_ERROR:
- return "NPP_MIRROR_FLIP_ERROR";
-#endif
-
- case NPP_ALIGNMENT_ERROR:
- return "NPP_ALIGNMENT_ERROR";
-
- case NPP_STEP_ERROR:
- return "NPP_STEP_ERROR";
-
- case NPP_SIZE_ERROR:
- return "NPP_SIZE_ERROR";
-
- case NPP_NULL_POINTER_ERROR:
- return "NPP_NULL_POINTER_ERROR";
-
- case NPP_CUDA_KERNEL_EXECUTION_ERROR:
- return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
-
- case NPP_NOT_IMPLEMENTED_ERROR:
- return "NPP_NOT_IMPLEMENTED_ERROR";
-
- case NPP_ERROR:
- return "NPP_ERROR";
-
- case NPP_SUCCESS:
- return "NPP_SUCCESS";
-
- case NPP_WRONG_INTERSECTION_QUAD_WARNING:
- return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
-
- case NPP_MISALIGNED_DST_ROI_WARNING:
- return "NPP_MISALIGNED_DST_ROI_WARNING";
-
- case NPP_AFFINE_QUAD_INCORRECT_WARNING:
- return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
-
- case NPP_DOUBLE_SIZE_WARNING:
- return "NPP_DOUBLE_SIZE_WARNING";
-
- case NPP_WRONG_INTERSECTION_ROI_WARNING:
- return "NPP_WRONG_INTERSECTION_ROI_WARNING";
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
- /* These are 6.0 or higher */
- case NPP_LUT_PALETTE_BITSIZE_ERROR:
- return "NPP_LUT_PALETTE_BITSIZE_ERROR";
-
- case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
- return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
-
- case NPP_QUALITY_INDEX_ERROR:
- return "NPP_QUALITY_INDEX_ERROR";
-
- case NPP_CHANNEL_ORDER_ERROR:
- return "NPP_CHANNEL_ORDER_ERROR";
-
- case NPP_ZERO_MASK_VALUE_ERROR:
- return "NPP_ZERO_MASK_VALUE_ERROR";
-
- case NPP_NUMBER_OF_CHANNELS_ERROR:
- return "NPP_NUMBER_OF_CHANNELS_ERROR";
-
- case NPP_COI_ERROR:
- return "NPP_COI_ERROR";
-
- case NPP_DIVISOR_ERROR:
- return "NPP_DIVISOR_ERROR";
-
- case NPP_CHANNEL_ERROR:
- return "NPP_CHANNEL_ERROR";
-
- case NPP_STRIDE_ERROR:
- return "NPP_STRIDE_ERROR";
-
- case NPP_ANCHOR_ERROR:
- return "NPP_ANCHOR_ERROR";
-
- case NPP_MASK_SIZE_ERROR:
- return "NPP_MASK_SIZE_ERROR";
-
- case NPP_MOMENT_00_ZERO_ERROR:
- return "NPP_MOMENT_00_ZERO_ERROR";
-
- case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
- return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
-
- case NPP_THRESHOLD_ERROR:
- return "NPP_THRESHOLD_ERROR";
-
- case NPP_CONTEXT_MATCH_ERROR:
- return "NPP_CONTEXT_MATCH_ERROR";
-
- case NPP_FFT_FLAG_ERROR:
- return "NPP_FFT_FLAG_ERROR";
-
- case NPP_FFT_ORDER_ERROR:
- return "NPP_FFT_ORDER_ERROR";
-
- case NPP_SCALE_RANGE_ERROR:
- return "NPP_SCALE_RANGE_ERROR";
-
- case NPP_DATA_TYPE_ERROR:
- return "NPP_DATA_TYPE_ERROR";
-
- case NPP_OUT_OFF_RANGE_ERROR:
- return "NPP_OUT_OFF_RANGE_ERROR";
-
- case NPP_DIVIDE_BY_ZERO_ERROR:
- return "NPP_DIVIDE_BY_ZERO_ERROR";
-
- case NPP_RANGE_ERROR:
- return "NPP_RANGE_ERROR";
-
- case NPP_NO_MEMORY_ERROR:
- return "NPP_NO_MEMORY_ERROR";
-
- case NPP_ERROR_RESERVED:
- return "NPP_ERROR_RESERVED";
-
- case NPP_NO_OPERATION_WARNING:
- return "NPP_NO_OPERATION_WARNING";
-
- case NPP_DIVIDE_BY_ZERO_WARNING:
- return "NPP_DIVIDE_BY_ZERO_WARNING";
-#endif
-
-#if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
- /* These are 7.0 or higher */
- case NPP_OVERFLOW_ERROR:
- return "NPP_OVERFLOW_ERROR";
-
- case NPP_CORRUPTED_DATA_ERROR:
- return "NPP_CORRUPTED_DATA_ERROR";
-#endif
- }
-
- return "";
-}
-#endif
-
-template
-void check(T result, char const *const func, const char *const file, int const line) {
- if (result) {
- fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result),
- _cudaGetErrorEnum(result), func);
- exit(EXIT_FAILURE);
- }
-}
-
-#ifdef __DRIVER_TYPES_H__
-// This will output the proper CUDA error strings in the event
-// that a CUDA host call returns an error
-#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
-
-// This will output the proper error string when calling cudaGetLastError
-#define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __getLastCudaError(const char *errorMessage, const char *file, const int line) {
- cudaError_t err = cudaGetLastError();
-
- if (cudaSuccess != err) {
- fprintf(stderr,
- "%s(%i) : getLastCudaError() CUDA error :"
- " %s : (%d) %s.\n",
- file, line, errorMessage, static_cast(err), cudaGetErrorString(err));
- exit(EXIT_FAILURE);
- }
-}
-
-// This will only print the proper error string when calling cudaGetLastError
-// but not exit program incase error detected.
-#define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
-
-inline void __printLastCudaError(const char *errorMessage, const char *file, const int line) {
- cudaError_t err = cudaGetLastError();
-
- if (cudaSuccess != err) {
- fprintf(stderr,
- "%s(%i) : getLastCudaError() CUDA error :"
- " %s : (%d) %s.\n",
- file, line, errorMessage, static_cast(err), cudaGetErrorString(err));
- }
-}
-#endif
-
-#ifndef MAX
-#define MAX(a, b) (a > b ? a : b)
-#endif
-
-// Float To Int conversion
-inline int ftoi(float value) { return (value >= 0 ? static_cast(value + 0.5) : static_cast(value - 0.5)); }
-
-// Beginning of GPU Architecture definitions
-inline int _ConvertSMVer2Cores(int major, int minor) {
- // Defines for GPU Architecture types (using the SM version to determine
- // the # of cores per SM
- typedef struct {
- int SM; // 0xMm (hexidecimal notation), M = SM Major version,
- // and m = SM minor version
- int Cores;
- } sSMtoCores;
-
- sSMtoCores nGpuArchCoresPerSM[] = {{0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
- {0x52, 128}, {0x53, 128}, {0x60, 64}, {0x61, 128}, {0x62, 128},
- {0x70, 64}, {0x72, 64}, {0x75, 64}, {-1, -1}};
-
- int index = 0;
-
- while (nGpuArchCoresPerSM[index].SM != -1) {
- if (nGpuArchCoresPerSM[index].SM == ((major << 4) + minor)) {
- return nGpuArchCoresPerSM[index].Cores;
- }
-
- index++;
- }
-
- // If we don't find the values, we default use the previous one
- // to run properly
- printf("MapSMtoCores for SM %d.%d is undefined."
- " Default to use %d Cores/SM\n",
- major, minor, nGpuArchCoresPerSM[index - 1].Cores);
- return nGpuArchCoresPerSM[index - 1].Cores;
-}
-
-inline const char *_ConvertSMVer2ArchName(int major, int minor) {
- // Defines for GPU Architecture types (using the SM version to determine
- // the GPU Arch name)
- typedef struct {
- int SM; // 0xMm (hexidecimal notation), M = SM Major version,
- // and m = SM minor version
- const char *name;
- } sSMtoArchName;
-
- sSMtoArchName nGpuArchNameSM[] = {{0x30, "Kepler"}, {0x32, "Kepler"}, {0x35, "Kepler"}, {0x37, "Kepler"},
- {0x50, "Maxwell"}, {0x52, "Maxwell"}, {0x53, "Maxwell"}, {0x60, "Pascal"},
- {0x61, "Pascal"}, {0x62, "Pascal"}, {0x70, "Volta"}, {0x72, "Xavier"},
- {0x75, "Turing"}, {-1, "Graphics Device"}};
-
- int index = 0;
-
- while (nGpuArchNameSM[index].SM != -1) {
- if (nGpuArchNameSM[index].SM == ((major << 4) + minor)) {
- return nGpuArchNameSM[index].name;
- }
-
- index++;
- }
-
- // If we don't find the values, we default use the previous one
- // to run properly
- printf("MapSMtoArchName for SM %d.%d is undefined."
- " Default to use %s\n",
- major, minor, nGpuArchNameSM[index - 1].name);
- return nGpuArchNameSM[index - 1].name;
-}
-// end of GPU Architecture definitions
-
-#ifdef __CUDA_RUNTIME_H__
-// General GPU Device CUDA Initialization
-inline int gpuDeviceInit(int devID) {
- int device_count;
- checkCudaErrors(cudaGetDeviceCount(&device_count));
-
- if (device_count == 0) {
- fprintf(stderr, "gpuDeviceInit() CUDA error: "
- "no devices supporting CUDA.\n");
- exit(EXIT_FAILURE);
- }
-
- if (devID < 0) {
- devID = 0;
- }
-
- if (devID > device_count - 1) {
- fprintf(stderr, "\n");
- fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n", device_count);
- fprintf(stderr,
- ">> gpuDeviceInit (-device=%d) is not a valid"
- " GPU device. <<\n",
- devID);
- fprintf(stderr, "\n");
- return -devID;
- }
-
- int computeMode = -1, major = 0, minor = 0;
- checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, devID));
- checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
- checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
- if (computeMode == cudaComputeModeProhibited) {
- fprintf(stderr, "Error: device is running in , no threads can use cudaSetDevice().\n");
- return -1;
- }
-
- if (major < 1) {
- fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
- exit(EXIT_FAILURE);
- }
-
- checkCudaErrors(cudaSetDevice(devID));
- printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, _ConvertSMVer2ArchName(major, minor));
-
- return devID;
-}
-
-// This function returns the best GPU (with maximum GFLOPS)
-inline int gpuGetMaxGflopsDeviceId() {
- int current_device = 0, sm_per_multiproc = 0;
- int max_perf_device = 0;
- int device_count = 0;
- int devices_prohibited = 0;
-
- uint64_t max_compute_perf = 0;
- checkCudaErrors(cudaGetDeviceCount(&device_count));
-
- if (device_count == 0) {
- fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
- " no devices supporting CUDA.\n");
- exit(EXIT_FAILURE);
- }
-
- // Find the best CUDA capable GPU device
- current_device = 0;
-
- while (current_device < device_count) {
- int computeMode = -1, major = 0, minor = 0;
- checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
- checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
- checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
-
- // If this GPU is not running on Compute Mode prohibited,
- // then we can add it to the list
- if (computeMode != cudaComputeModeProhibited) {
- if (major == 9999 && minor == 9999) {
- sm_per_multiproc = 1;
- } else {
- sm_per_multiproc = _ConvertSMVer2Cores(major, minor);
- }
- int multiProcessorCount = 0, clockRate = 0;
- checkCudaErrors(
- cudaDeviceGetAttribute(&multiProcessorCount, cudaDevAttrMultiProcessorCount, current_device));
- checkCudaErrors(cudaDeviceGetAttribute(&clockRate, cudaDevAttrClockRate, current_device));
- uint64_t compute_perf = (uint64_t)multiProcessorCount * sm_per_multiproc * clockRate;
-
- if (compute_perf > max_compute_perf) {
- max_compute_perf = compute_perf;
- max_perf_device = current_device;
- }
- } else {
- devices_prohibited++;
- }
-
- ++current_device;
- }
-
- if (devices_prohibited == device_count) {
- fprintf(stderr, "gpuGetMaxGflopsDeviceId() CUDA error:"
- " all devices have compute mode prohibited.\n");
- exit(EXIT_FAILURE);
- }
-
- return max_perf_device;
-}
-
-// Initialization code to find the best CUDA Device
-inline int findCudaDevice(int argc, const char **argv) {
- int devID = 0;
-
- // If the command-line has a device number specified, use it
- if (checkCmdLineFlag(argc, argv, "device")) {
- devID = getCmdLineArgumentInt(argc, argv, "device=");
-
- if (devID < 0) {
- printf("Invalid command line parameter\n ");
- exit(EXIT_FAILURE);
- } else {
- devID = gpuDeviceInit(devID);
-
- if (devID < 0) {
- printf("exiting...\n");
- exit(EXIT_FAILURE);
- }
- }
- } else {
- // Otherwise pick the device with highest Gflops/s
- devID = gpuGetMaxGflopsDeviceId();
- checkCudaErrors(cudaSetDevice(devID));
- int major = 0, minor = 0;
- checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, devID));
- checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, devID));
- printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID, _ConvertSMVer2ArchName(major, minor),
- major, minor);
- }
-
- return devID;
-}
-
-inline int findIntegratedGPU() {
- int current_device = 0;
- int device_count = 0;
- int devices_prohibited = 0;
-
- checkCudaErrors(cudaGetDeviceCount(&device_count));
-
- if (device_count == 0) {
- fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
- exit(EXIT_FAILURE);
- }
-
- // Find the integrated GPU which is compute capable
- while (current_device < device_count) {
- int computeMode = -1, integrated = -1;
- checkCudaErrors(cudaDeviceGetAttribute(&computeMode, cudaDevAttrComputeMode, current_device));
- checkCudaErrors(cudaDeviceGetAttribute(&integrated, cudaDevAttrIntegrated, current_device));
- // If GPU is integrated and is not running on Compute Mode prohibited,
- // then cuda can map to GLES resource
- if (integrated && (computeMode != cudaComputeModeProhibited)) {
- checkCudaErrors(cudaSetDevice(current_device));
-
- int major = 0, minor = 0;
- checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, current_device));
- checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, current_device));
- printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", current_device,
- _ConvertSMVer2ArchName(major, minor), major, minor);
-
- return current_device;
- } else {
- devices_prohibited++;
- }
-
- current_device++;
- }
-
- if (devices_prohibited == device_count) {
- fprintf(stderr, "CUDA error:"
- " No GLES-CUDA Interop capable GPU found.\n");
- exit(EXIT_FAILURE);
- }
-
- return -1;
-}
-
-// General check for CUDA GPU SM Capabilities
-inline bool checkCudaCapabilities(int major_version, int minor_version) {
- int dev;
- int major = 0, minor = 0;
-
- checkCudaErrors(cudaGetDevice(&dev));
- checkCudaErrors(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, dev));
- checkCudaErrors(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, dev));
-
- if ((major > major_version) || (major == major_version && minor >= minor_version)) {
- printf(" Device %d: <%16s >, Compute SM %d.%d detected\n", dev, _ConvertSMVer2ArchName(major, minor), major,
- minor);
- return true;
- } else {
- printf(" No GPU device was found that can support "
- "CUDA compute capability %d.%d.\n",
- major_version, minor_version);
- return false;
- }
-}
-#endif
-
-// end of CUDA Helper Functions
-
-#endif // COMMON_HELPER_CUDA_H_
diff --git a/include/cufinufft/contrib/cuda_samples/helper_string.h b/include/cufinufft/contrib/cuda_samples/helper_string.h
deleted file mode 100644
index 9dfaa28d6..000000000
--- a/include/cufinufft/contrib/cuda_samples/helper_string.h
+++ /dev/null
@@ -1,357 +0,0 @@
-/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * * Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * * Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * * Neither the name of NVIDIA CORPORATION nor the names of its
- * contributors may be used to endorse or promote products derived
- * from this software without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
- * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
- * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
- * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
- * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
- * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
- * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
-// These are helper functions for the SDK samples (string parsing, timers, etc)
-#ifndef COMMON_HELPER_STRING_H_
-#define COMMON_HELPER_STRING_H_
-
-#include
-#include
-#include
-#include
-
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
-#ifndef _CRT_SECURE_NO_DEPRECATE
-#define _CRT_SECURE_NO_DEPRECATE
-#endif
-#ifndef STRCASECMP
-#define STRCASECMP _stricmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP _strnicmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
-#endif
-
-#ifndef FOPEN
-#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result != 0)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf_s
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf_s
-#endif
-#else // Linux Includes
-#include
-#include
-
-#ifndef STRCASECMP
-#define STRCASECMP strcasecmp
-#endif
-#ifndef STRNCASECMP
-#define STRNCASECMP strncasecmp
-#endif
-#ifndef STRCPY
-#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
-#endif
-
-#ifndef FOPEN
-#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
-#endif
-#ifndef FOPEN_FAIL
-#define FOPEN_FAIL(result) (result == NULL)
-#endif
-#ifndef SSCANF
-#define SSCANF sscanf
-#endif
-#ifndef SPRINTF
-#define SPRINTF sprintf
-#endif
-#endif
-
-#ifndef EXIT_WAIVED
-#define EXIT_WAIVED 2
-#endif
-
-// CUDA Utility Helper Functions
-inline int stringRemoveDelimiter(char delimiter, const char *string) {
- int string_start = 0;
-
- while (string[string_start] == delimiter) {
- string_start++;
- }
-
- if (string_start >= static_cast(strlen(string) - 1)) {
- return 0;
- }
-
- return string_start;
-}
-
-inline int getFileExtension(char *filename, char **extension) {
- int string_length = static_cast(strlen(filename));
-
- while (filename[string_length--] != '.') {
- if (string_length == 0)
- break;
- }
-
- if (string_length > 0)
- string_length += 2;
-
- if (string_length == 0)
- *extension = NULL;
- else
- *extension = &filename[string_length];
-
- return string_length;
-}
-
-inline bool checkCmdLineFlag(const int argc, const char **argv, const char *string_ref) {
- bool bFound = false;
-
- if (argc >= 1) {
- for (int i = 1; i < argc; i++) {
- int string_start = stringRemoveDelimiter('-', argv[i]);
- const char *string_argv = &argv[i][string_start];
-
- const char *equal_pos = strchr(string_argv, '=');
- int argv_length = static_cast(equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
-
- int length = static_cast(strlen(string_ref));
-
- if (length == argv_length && !STRNCASECMP(string_argv, string_ref, length)) {
- bFound = true;
- continue;
- }
- }
- }
-
- return bFound;
-}
-
-// This function wraps the CUDA Driver API into a template function
-template
-inline bool getCmdLineArgumentValue(const int argc, const char **argv, const char *string_ref, T *value) {
- bool bFound = false;
-
- if (argc >= 1) {
- for (int i = 1; i < argc; i++) {
- int string_start = stringRemoveDelimiter('-', argv[i]);
- const char *string_argv = &argv[i][string_start];
- int length = static_cast(strlen(string_ref));
-
- if (!STRNCASECMP(string_argv, string_ref, length)) {
- if (length + 1 <= static_cast(strlen(string_argv))) {
- int auto_inc = (string_argv[length] == '=') ? 1 : 0;
- *value = (T)atoi(&string_argv[length + auto_inc]);
- }
-
- bFound = true;
- i = argc;
- }
- }
- }
-
- return bFound;
-}
-
-inline int getCmdLineArgumentInt(const int argc, const char **argv, const char *string_ref) {
- bool bFound = false;
- int value = -1;
-
- if (argc >= 1) {
- for (int i = 1; i < argc; i++) {
- int string_start = stringRemoveDelimiter('-', argv[i]);
- const char *string_argv = &argv[i][string_start];
- int length = static_cast(strlen(string_ref));
-
- if (!STRNCASECMP(string_argv, string_ref, length)) {
- if (length + 1 <= static_cast(strlen(string_argv))) {
- int auto_inc = (string_argv[length] == '=') ? 1 : 0;
- value = atoi(&string_argv[length + auto_inc]);
- } else {
- value = 0;
- }
-
- bFound = true;
- continue;
- }
- }
- }
-
- if (bFound) {
- return value;
- } else {
- return 0;
- }
-}
-
-inline float getCmdLineArgumentFloat(const int argc, const char **argv, const char *string_ref) {
- bool bFound = false;
- float value = -1;
-
- if (argc >= 1) {
- for (int i = 1; i < argc; i++) {
- int string_start = stringRemoveDelimiter('-', argv[i]);
- const char *string_argv = &argv[i][string_start];
- int length = static_cast(strlen(string_ref));
-
- if (!STRNCASECMP(string_argv, string_ref, length)) {
- if (length + 1 <= static_cast(strlen(string_argv))) {
- int auto_inc = (string_argv[length] == '=') ? 1 : 0;
- value = static_cast(atof(&string_argv[length + auto_inc]));
- } else {
- value = 0.f;
- }
-
- bFound = true;
- continue;
- }
- }
- }
-
- if (bFound) {
- return value;
- } else {
- return 0;
- }
-}
-
-inline bool getCmdLineArgumentString(const int argc, const char **argv, const char *string_ref, char **string_retval) {
- bool bFound = false;
-
- if (argc >= 1) {
- for (int i = 1; i < argc; i++) {
- int string_start = stringRemoveDelimiter('-', argv[i]);
- char *string_argv = const_cast(&argv[i][string_start]);
- int length = static_cast(strlen(string_ref));
-
- if (!STRNCASECMP(string_argv, string_ref, length)) {
- *string_retval = &string_argv[length + 1];
- bFound = true;
- continue;
- }
- }
- }
-
- if (!bFound) {
- *string_retval = NULL;
- }
-
- return bFound;
-}
-
-//////////////////////////////////////////////////////////////////////////////
-//! Find the path for a file assuming that
-//! files are found in the searchPath.
-//!
-//! @return the path if succeeded, otherwise 0
-//! @param filename name of the file
-//! @param executable_path optional absolute path of the executable
-//////////////////////////////////////////////////////////////////////////////
-inline char *sdkFindFilePath(const char *filename, const char *executable_path) {
- // defines a variable that is replaced with the name of the
- // executable
-
- // Typical relative search paths to locate needed companion files (e.g. sample
- // input data, or JIT source files) The origin for the relative search may be
- // the .exe file, a .bat file launching an .exe, a browser .exe launching the
- // .exe or .bat, etc
- const char *searchPath[] = {
- "./", // same dir
- "./data/", // same dir
- "../../../../Samples//", // up 4 in tree
- "../../../Samples//", // up 3 in tree
- "../../Samples//", // up 2 in tree
- "../../../../Samples//data/", // up 4 in tree
- "../../../Samples//data/", // up 3 in tree
- "../../Samples//data/", // up 2 in tree
- };
-
- // Extract the executable name
- std::string executable_name;
-
- if (executable_path != 0) {
- executable_name = std::string(executable_path);
-
-#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
- // Windows path delimiter
- size_t delimiter_pos = executable_name.find_last_of('\\');
- executable_name.erase(0, delimiter_pos + 1);
-
- if (executable_name.rfind(".exe") != std::string::npos) {
- // we strip .exe, only if the .exe is found
- executable_name.resize(executable_name.size() - 4);
- }
-
-#else
- // Linux & OSX path delimiter
- size_t delimiter_pos = executable_name.find_last_of('/');
- executable_name.erase(0, delimiter_pos + 1);
-#endif
- }
-
- // Loop over all search paths and return the first hit
- for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
- std::string path(searchPath[i]);
- size_t executable_name_pos = path.find("");
-
- // If there is executable_name variable in the searchPath
- // replace it with the value
- if (executable_name_pos != std::string::npos) {
- if (executable_path != 0) {
- path.replace(executable_name_pos, strlen(""), executable_name);
- } else {
- // Skip this path entry if no executable argument is given
- continue;
- }
- }
-
-#ifdef _DEBUG
- printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
-#endif
-
- // Test if the file exists
- path.append(filename);
- FILE *fp;
- FOPEN(fp, path.c_str(), "rb");
-
- if (fp != NULL) {
- fclose(fp);
- // File found
- // returning an allocated array here for backwards compatibility reasons
- char *file_path = reinterpret_cast(malloc(path.length() + 1));
- STRCPY(file_path, path.length() + 1, path.c_str());
- return file_path;
- }
-
- if (fp) {
- fclose(fp);
- }
- }
-
- // File not found
- return 0;
-}
-
-#endif // COMMON_HELPER_STRING_H_
diff --git a/include/cufinufft/contrib/helper_cuda.h b/include/cufinufft/contrib/helper_cuda.h
new file mode 100644
index 000000000..fe4010393
--- /dev/null
+++ b/include/cufinufft/contrib/helper_cuda.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * * Neither the name of NVIDIA CORPORATION nor the names of its
+ * contributors may be used to endorse or promote products derived
+ * from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+ * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef COMMON_HELPER_CUDA_H_
+#define COMMON_HELPER_CUDA_H_
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+static const char *_cudaGetErrorEnum(cudaError_t error) { return cudaGetErrorName(error); }
+
+// This will output the proper CUDA error strings in the event
+// that a CUDA host call returns an error
+#define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
+
+#define RETURN_IF_CUDA_ERROR \
+ { \
+ cudaError_t err = cudaGetLastError(); \
+ if (err != cudaSuccess) { \
+ printf("[%s] Error: %s\n", __func__, cudaGetErrorString(err)); \
+ return FINUFFT_ERR_CUDA_FAILURE; \
+ } \
+ }
+
+#define CUDA_FREE_AND_NULL(val) \
+ { \
+ check(cudaFree(val), #val, __FILE__, __LINE__); \
+ val = nullptr; \
+ }
+
+static const char *cufftGetErrorString(cufftResult error) {
+ switch (error) {
+ case CUFFT_SUCCESS:
+ return "CUFFT_SUCCESS";
+
+ case CUFFT_INVALID_PLAN:
+ return "CUFFT_INVALID_PLAN";
+
+ case CUFFT_ALLOC_FAILED:
+ return "CUFFT_ALLOC_FAILED";
+
+ case CUFFT_INVALID_TYPE:
+ return "CUFFT_INVALID_TYPE";
+
+ case CUFFT_INVALID_VALUE:
+ return "CUFFT_INVALID_VALUE";
+
+ case CUFFT_INTERNAL_ERROR:
+ return "CUFFT_INTERNAL_ERROR";
+
+ case CUFFT_EXEC_FAILED:
+ return "CUFFT_EXEC_FAILED";
+
+ case CUFFT_SETUP_FAILED:
+ return "CUFFT_SETUP_FAILED";
+
+ case CUFFT_INVALID_SIZE:
+ return "CUFFT_INVALID_SIZE";
+
+ case CUFFT_UNALIGNED_DATA:
+ return "CUFFT_UNALIGNED_DATA";
+
+ case CUFFT_INCOMPLETE_PARAMETER_LIST:
+ return "CUFFT_INCOMPLETE_PARAMETER_LIST";
+
+ case CUFFT_INVALID_DEVICE:
+ return "CUFFT_INVALID_DEVICE";
+
+ case CUFFT_PARSE_ERROR:
+ return "CUFFT_PARSE_ERROR";
+
+ case CUFFT_NO_WORKSPACE:
+ return "CUFFT_NO_WORKSPACE";
+
+ case CUFFT_NOT_IMPLEMENTED:
+ return "CUFFT_NOT_IMPLEMENTED";
+
+ case CUFFT_LICENSE_ERROR:
+ return "CUFFT_LICENSE_ERROR";
+
+ case CUFFT_NOT_SUPPORTED:
+ return "CUFFT_NOT_SUPPORTED";
+ }
+
+ return "";
+}
+
+template
+int check(T result, char const *const func, const char *const file, int const line) {
+ if (result) {
+ fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line, static_cast(result),
+ _cudaGetErrorEnum(result), func);
+ return FINUFFT_ERR_CUDA_FAILURE;
+ }
+
+ return 0;
+}
+
+#endif // COMMON_HELPER_CUDA_H_
diff --git a/include/cufinufft/defs.h b/include/cufinufft/defs.h
index 64b237759..6cdb84340 100644
--- a/include/cufinufft/defs.h
+++ b/include/cufinufft/defs.h
@@ -13,17 +13,6 @@
// FIXME: If cufft ever takes N > INT_MAX...
constexpr int32_t MAX_NF = std::numeric_limits::max();
-// Global error codes for the library...
-#define WARN_EPS_TOO_SMALL 1
-#define ERR_MAXNALLOC 2
-#define ERR_SPREAD_BOX_SMALL 3
-#define ERR_SPREAD_PTS_OUT_RANGE 4
-#define ERR_SPREAD_ALLOC 5
-#define ERR_SPREAD_DIR 6
-#define ERR_UPSAMPFAC_TOO_SMALL 7
-#define HORNER_WRONG_BETA 8
-#define ERR_NDATA_NOTVALID 9
-
// allow compile-time switch off of openmp, so compilation without any openmp
// is done (Note: _OPENMP is automatically set by -fopenmp compile flag)
#ifdef _OPENMP
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index c508cb69f..10c459961 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -2,11 +2,10 @@
#define CUFINUFFT_IMPL_H
#include
-
-#include
#include
+#include
#include
#include
#include
@@ -15,6 +14,8 @@
#include
#include
+#include
+
// 1d
template
int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_plan_t *d_plan);
@@ -88,18 +89,22 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
Melody Shih 07/25/19. Use-facing moved to markdown, Barnett 2/16/21.
*/
- // Mult-GPU support: set the CUDA Device ID:
- int orig_gpu_device_id;
- cudaGetDevice(&orig_gpu_device_id);
- if (opts == NULL) {
- // options might not be supplied to this function => assume device
- // 0 by default
- cudaSetDevice(0);
- } else {
- cudaSetDevice(opts->gpu_device_id);
+ int ier;
+ cuDoubleComplex *d_a = nullptr; // fseries temp data
+ T *d_f = nullptr; // fseries temp data
+
+ if (type < 1 || type > 2) {
+ fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
+ return FINUFFT_ERR_TYPE_NOTVALID;
+ }
+ if (ntransf < 1) {
+ fprintf(stderr, "[%s] Invalid ntransf (%d): should be at least 1.\n", __func__, ntransf);
+ return FINUFFT_ERR_NTRANS_NOTVALID;
}
- int ier;
+ // Mult-GPU support: set the CUDA Device ID:
+ const int device_id = opts == NULL ? 0 : opts->gpu_device_id;
+ cufinufft::utils::WithCudaDevice device_swapper(device_id);
/* allocate the plan structure, assign address to user pointer. */
cufinufft_plan_t *d_plan = new cufinufft_plan_t;
@@ -109,20 +114,36 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
/* If a user has not supplied their own options, assign defaults for them. */
if (opts == NULL) { // use default opts
- ier = cufinufft_default_opts(type, dim, &(d_plan->opts));
- if (ier != 0) {
- printf("error: cufinufft_default_opts returned error %d.\n", ier);
- return ier;
- }
+ cufinufft_default_opts(&(d_plan->opts));
} else { // or read from what's passed in
d_plan->opts = *opts; // keep a deep copy; changing *opts now has no effect
}
+ /* Automatically set GPU method. */
+ if (d_plan->opts.gpu_method == 0) {
+ /* For type 1, we default to method 2 (SM) since this is generally faster.
+ * However, in the special case of _double precision_ in _three dimensions_
+ * with more than _three digits of precision_, there is note enough shared
+ * memory for this to work. As a result, we will default to method 1 (GM) in
+ * this special case.
+ *
+ * For type 2, we always default to method 1 (GM). */
+ if (type == 1 && (sizeof(T) == 4 || dim < 3 || tol >= 1e-3))
+ d_plan->opts.gpu_method = 2;
+ else if (type == 1 && tol < 1e-3)
+ d_plan->opts.gpu_method = 1;
+ else if (type == 2)
+ d_plan->opts.gpu_method = 1;
+ }
+
/* Setup Spreader */
using namespace cufinufft::common;
- ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts);
- if (ier > 1) // proceed if success or warning
+ // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
+ if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
+ delete *d_plan_ptr;
+ *d_plan_ptr = nullptr;
return ier;
+ }
d_plan->dim = dim;
d_plan->ms = nmodes[0];
@@ -157,64 +178,82 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
using namespace cufinufft::memtransfer;
switch (d_plan->dim) {
case 1: {
- ier = allocgpumem1d_plan(d_plan);
+ if ((ier = allocgpumem1d_plan(d_plan)))
+ goto finalize;
} break;
case 2: {
- ier = allocgpumem2d_plan(d_plan);
+ if ((ier = allocgpumem2d_plan(d_plan)))
+ goto finalize;
} break;
case 3: {
- ier = allocgpumem3d_plan(d_plan);
+ if ((ier = allocgpumem3d_plan(d_plan)))
+ goto finalize;
} break;
}
cufftHandle fftplan;
+ cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};
- cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(), maxbatchsize);
+ cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1, inembed[0], cufft_type(),
+ maxbatchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};
- cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1, inembed[0] * inembed[1],
- cufft_type(), maxbatchsize);
+ cufft_status = cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
+ inembed[0] * inembed[1], cufft_type(), maxbatchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};
- cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
- inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize);
+ cufft_status = cufftPlanMany(&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
+ inembed[0] * inembed[1] * inembed[2], cufft_type(), maxbatchsize);
} break;
}
- d_plan->fftplan = fftplan;
- std::complex a[3 * MAX_NQUAD];
- T f[3 * MAX_NQUAD];
- onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
- if (dim > 1) {
- onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts);
+ if (cufft_status != CUFFT_SUCCESS) {
+ fprintf(stderr, "[%s] cufft makeplan error: %s", __func__, cufftGetErrorString(cufft_status));
+ ier = FINUFFT_ERR_CUDA_FAILURE;
+ goto finalize;
}
- if (dim > 2) {
- onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts);
+ d_plan->fftplan = fftplan;
+ {
+ std::complex a[3 * MAX_NQUAD];
+ T f[3 * MAX_NQUAD];
+ onedim_fseries_kernel_precomp(nf1, f, a, d_plan->spopts);
+ if (dim > 1)
+ onedim_fseries_kernel_precomp(nf2, f + MAX_NQUAD, a + MAX_NQUAD, d_plan->spopts);
+ if (dim > 2)
+ onedim_fseries_kernel_precomp(nf3, f + 2 * MAX_NQUAD, a + 2 * MAX_NQUAD, d_plan->spopts);
+
+ if ((ier = checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex)))))
+ goto finalize;
+ if ((ier = checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T)))))
+ goto finalize;
+ if ((ier = checkCudaErrors(
+ cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice))))
+ goto finalize;
+ if ((ier = checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice))))
+ goto finalize;
+ if ((ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
+ d_plan->fwkerhalf3, d_plan->spopts.nspread)))
+ goto finalize;
}
- cuDoubleComplex *d_a;
- T *d_f;
- checkCudaErrors(cudaMalloc(&d_a, dim * MAX_NQUAD * sizeof(cuDoubleComplex)));
- checkCudaErrors(cudaMalloc(&d_f, dim * MAX_NQUAD * sizeof(T)));
- checkCudaErrors(cudaMemcpy(d_a, a, dim * MAX_NQUAD * sizeof(cuDoubleComplex), cudaMemcpyHostToDevice));
- checkCudaErrors(cudaMemcpy(d_f, f, dim * MAX_NQUAD * sizeof(T), cudaMemcpyHostToDevice));
- ier = cufserieskernelcompute(d_plan->dim, nf1, nf2, nf3, d_f, d_a, d_plan->fwkerhalf1, d_plan->fwkerhalf2,
- d_plan->fwkerhalf3, d_plan->spopts.nspread);
-
+finalize:
cudaFree(d_a);
cudaFree(d_f);
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
+
+ if (ier > 1) {
+ delete *d_plan_ptr;
+ *d_plan_ptr = nullptr;
+ }
return ier;
}
@@ -258,10 +297,7 @@ Notes: the type T means either single or double, matching the
Melody Shih 07/25/19; Barnett 2/16/21 moved out docs.
*/
{
- // Mult-GPU support: set the CUDA Device ID:
- int orig_gpu_device_id;
- cudaGetDevice(&orig_gpu_device_id);
- cudaSetDevice(d_plan->opts.gpu_device_id);
+ cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
int nf1 = d_plan->nf1;
int nf2 = d_plan->nf2;
@@ -283,6 +319,8 @@ Notes: the type T means either single or double, matching the
ier = allocgpumem3d_nupts(d_plan);
} break;
}
+ if (ier)
+ return ier;
d_plan->kx = d_kx;
if (dim > 1)
@@ -293,93 +331,28 @@ Notes: the type T means either single or double, matching the
using namespace cufinufft::spreadinterp;
switch (d_plan->dim) {
case 1: {
- if (d_plan->opts.gpu_method == 1) {
- ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return 1;
- }
- }
- if (d_plan->opts.gpu_method == 2) {
- ier = cuspread1d_subprob_prop(nf1, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return 1;
- }
- }
+ if (d_plan->opts.gpu_method == 1 && (ier = cuspread1d_nuptsdriven_prop(nf1, M, d_plan)))
+ fprintf(stderr, "error: cuspread1d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
+ if (d_plan->opts.gpu_method == 2 && (ier = cuspread1d_subprob_prop(nf1, M, d_plan)))
+ fprintf(stderr, "error: cuspread1d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
} break;
case 2: {
- if (d_plan->opts.gpu_method == 1) {
- ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return 1;
- }
- }
- if (d_plan->opts.gpu_method == 2) {
- ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return 1;
- }
- }
+ if (d_plan->opts.gpu_method == 1 && (ier = cuspread2d_nuptsdriven_prop(nf1, nf2, M, d_plan)))
+ fprintf(stderr, "error: cuspread2d_nupts_prop, method(%d)\n", d_plan->opts.gpu_method);
+ if (d_plan->opts.gpu_method == 2 && (ier = cuspread2d_subprob_prop(nf1, nf2, M, d_plan)))
+ fprintf(stderr, "error: cuspread2d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
} break;
case 3: {
- if (d_plan->opts.gpu_method == 4) {
- int ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return ier;
- }
- }
- if (d_plan->opts.gpu_method == 1) {
- ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return ier;
- }
- }
- if (d_plan->opts.gpu_method == 2) {
- int ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan);
- if (ier != 0) {
- printf("error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
-
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
- return ier;
- }
- }
+ if (d_plan->opts.gpu_method == 1 && (ier = cuspread3d_nuptsdriven_prop(nf1, nf2, nf3, M, d_plan)))
+ fprintf(stderr, "error: cuspread3d_nuptsdriven_prop, method(%d)\n", d_plan->opts.gpu_method);
+ if (d_plan->opts.gpu_method == 2 && (ier = cuspread3d_subprob_prop(nf1, nf2, nf3, M, d_plan)))
+ fprintf(stderr, "error: cuspread3d_subprob_prop, method(%d)\n", d_plan->opts.gpu_method);
+ if (d_plan->opts.gpu_method == 4 && (ier = cuspread3d_blockgather_prop(nf1, nf2, nf3, M, d_plan)))
+ fprintf(stderr, "error: cuspread3d_blockgather_prop, method(%d)\n", d_plan->opts.gpu_method);
} break;
}
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
- return 0;
+ return ier;
}
template
@@ -514,11 +487,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff
Melody Shih 07/25/19; Barnett 2/16/21.
*/
{
- // Mult-GPU support: set the CUDA Device ID:
- int orig_gpu_device_id;
- cudaGetDevice(&orig_gpu_device_id);
- cudaSetDevice(d_plan->opts.gpu_device_id);
-
+ cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
int ier;
int type = d_plan->type;
switch (d_plan->dim) {
@@ -529,7 +498,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff
ier = cufinufft1d2_exec(d_c, d_fk, d_plan);
if (type == 3) {
std::cerr << "Not Implemented yet" << std::endl;
- ier = 1;
+ ier = FINUFFT_ERR_TYPE_NOTVALID;
}
} break;
case 2: {
@@ -539,7 +508,7 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff
ier = cufinufft2d2_exec(d_c, d_fk, d_plan);
if (type == 3) {
std::cerr << "Not Implemented yet" << std::endl;
- ier = 1;
+ ier = FINUFFT_ERR_TYPE_NOTVALID;
}
} break;
case 3: {
@@ -549,14 +518,11 @@ int cufinufft_execute_impl(cuda_complex *d_c, cuda_complex *d_fk, cufinuff
ier = cufinufft3d2_exec(d_c, d_fk, d_plan);
if (type == 3) {
std::cerr << "Not Implemented yet" << std::endl;
- ier = 1;
+ ier = FINUFFT_ERR_TYPE_NOTVALID;
}
} break;
}
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
-
return ier;
}
@@ -572,41 +538,21 @@ int cufinufft_destroy_impl(cufinufft_plan_t *d_plan)
Also see ../docs/cppdoc.md for main user-facing documentation.
*/
{
- // Mult-GPU support: set the CUDA Device ID:
- int orig_gpu_device_id;
- cudaGetDevice(&orig_gpu_device_id);
- cudaSetDevice(d_plan->opts.gpu_device_id);
-
- // Can't destroy a Null pointer.
- if (!d_plan) {
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
- return 1;
- }
+ cufinufft::utils::WithCudaDevice device_swapper(d_plan->opts.gpu_device_id);
+
+ // Can't destroy a null pointer.
+ if (!d_plan)
+ return FINUFFT_ERR_PLAN_NOTVALID;
if (d_plan->fftplan)
cufftDestroy(d_plan->fftplan);
using namespace cufinufft::memtransfer;
- switch (d_plan->dim) {
- case 1: {
- freegpumemory1d(d_plan);
- } break;
- case 2: {
- freegpumemory2d(d_plan);
- } break;
- case 3: {
- freegpumemory3d(d_plan);
- } break;
- }
+ freegpumemory(d_plan);
/* free/destruct the plan */
delete d_plan;
- /* set pointer to NULL now that we've hopefully free'd the memory. */
- d_plan = NULL;
- // Multi-GPU support: reset the device ID
- cudaSetDevice(orig_gpu_device_id);
return 0;
} // namespace cufinufft
#endif
diff --git a/include/cufinufft/memtransfer.h b/include/cufinufft/memtransfer.h
index d9a229ae0..382f911e9 100644
--- a/include/cufinufft/memtransfer.h
+++ b/include/cufinufft/memtransfer.h
@@ -11,19 +11,15 @@ int allocgpumem1d_plan(cufinufft_plan_t *d_plan);
template
int allocgpumem1d_nupts(cufinufft_plan_t *d_plan);
template
-void freegpumemory1d(cufinufft_plan_t *d_plan);
+void freegpumemory(cufinufft_plan_t *d_plan);
template
int allocgpumem2d_plan(cufinufft_plan_t *d_plan);
template
int allocgpumem2d_nupts(cufinufft_plan_t *d_plan);
template
-void freegpumemory2d(cufinufft_plan_t *d_plan);
-template
int allocgpumem3d_plan(cufinufft_plan_t *d_plan);
template
int allocgpumem3d_nupts(cufinufft_plan_t *d_plan);
-template
-void freegpumemory3d(cufinufft_plan_t *d_plan);
} // namespace memtransfer
} // namespace cufinufft
diff --git a/include/cufinufft/types.h b/include/cufinufft/types.h
index 6f46e750a..fea996a9a 100644
--- a/include/cufinufft/types.h
+++ b/include/cufinufft/types.h
@@ -26,7 +26,6 @@ struct cuda_complex_impl {
template
using cuda_complex = typename cuda_complex_impl::type;
-
template
struct cufinufft_plan_t {
cufinufft_opts opts;
@@ -46,7 +45,6 @@ struct cufinufft_plan_t {
int iflag;
int totalnumsubprob;
- int byte_now;
T *fwkerhalf1;
T *fwkerhalf2;
T *fwkerhalf3;
@@ -72,10 +70,8 @@ struct cufinufft_plan_t {
int *subprob_to_nupts;
cufftHandle fftplan;
- cudaStream_t *streams;
};
-
template
static cufftType_t cufft_type();
template <>
diff --git a/include/cufinufft/utils.h b/include/cufinufft/utils.h
index 35c70428d..e8deb42e9 100644
--- a/include/cufinufft/utils.h
+++ b/include/cufinufft/utils.h
@@ -8,6 +8,8 @@
#include
#include
+#include
+
#include
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__)
@@ -30,6 +32,18 @@ __inline__ __device__ double atomicAdd(double *address, double val) {
namespace cufinufft {
namespace utils {
+class WithCudaDevice {
+ public:
+ WithCudaDevice(int device) {
+ cudaGetDevice(&orig_device_);
+ cudaSetDevice(device);
+ }
+
+ ~WithCudaDevice() { cudaSetDevice(orig_device_); }
+
+ private:
+ int orig_device_;
+};
// jfm timer class
class CNTime {
diff --git a/include/cufinufft_errors.h b/include/cufinufft_errors.h
deleted file mode 100644
index 982ca0c82..000000000
--- a/include/cufinufft_errors.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef __CUFINUFFT_ERRORS_H__
-#define __CUFINUFFT_ERRORS_H__
-#include
-
-// For error checking
-static const char *_cufftGetErrorEnum(cufftResult error) {
- switch (error) {
- case CUFFT_SUCCESS:
- return "cufft_success";
- case CUFFT_INVALID_PLAN:
- return "cufft_invalid_plan";
- case CUFFT_ALLOC_FAILED:
- return "cufft_alloc_failed";
- case CUFFT_INVALID_TYPE:
- return "cufft_invalid_type";
- case CUFFT_INVALID_VALUE:
- return "cufft_invalid_value";
- case CUFFT_INTERNAL_ERROR:
- return "cufft_internal_error";
- case CUFFT_EXEC_FAILED:
- return "cufft_exec_failed";
- case CUFFT_SETUP_FAILED:
- return "cufft_setup_failed";
- case CUFFT_INVALID_SIZE:
- return "cufft_invalid_size";
- case CUFFT_UNALIGNED_DATA:
- return "cufft_unaligned data";
- case CUFFT_INCOMPLETE_PARAMETER_LIST:
- return "cufft_incomplete_parameter_list";
- case CUFFT_INVALID_DEVICE:
- return "cufft_invalid_device";
- case CUFFT_PARSE_ERROR:
- return "cufft_parse_error";
- case CUFFT_NO_WORKSPACE:
- return "cufft_no_workspace";
- case CUFFT_NOT_IMPLEMENTED:
- return "cufft_not_implemented";
- case CUFFT_LICENSE_ERROR:
- return "cufft_license_error";
- case CUFFT_NOT_SUPPORTED:
- return "cufft_not_supported";
- }
- return "";
-}
-
-#endif
diff --git a/include/finufft.h b/include/finufft.h
index 61754564c..d0a42ee89 100644
--- a/include/finufft.h
+++ b/include/finufft.h
@@ -29,6 +29,9 @@
#include
#include
+// Public error numbers
+#include
+
// octave (mkoctfile) needs this otherwise it doesn't know what int64_t is!
#include
#define FINUFFT_BIGINT int64_t
diff --git a/include/finufft/defs.h b/include/finufft/defs.h
index c6be287c9..77bc69b6b 100644
--- a/include/finufft/defs.h
+++ b/include/finufft/defs.h
@@ -20,27 +20,6 @@
#include
-// ---------- Global error/warning output codes for the library ---------------
-// (it could be argued these belong in finufft.h, but to avoid polluting
-// user's name space we keep them here)
-// NB: if change these numbers, also must regen test/results/dumbinputs.refout
-#define WARN_EPS_TOO_SMALL 1
-// this means that a fine grid array dim exceeded MAX_NF; no malloc tried...
-#define ERR_MAXNALLOC 2
-#define ERR_SPREAD_BOX_SMALL 3
-#define ERR_SPREAD_PTS_OUT_RANGE 4
-#define ERR_SPREAD_ALLOC 5
-#define ERR_SPREAD_DIR 6
-#define ERR_UPSAMPFAC_TOO_SMALL 7
-#define ERR_HORNER_WRONG_BETA 8
-#define ERR_NTRANS_NOTVALID 9
-#define ERR_TYPE_NOTVALID 10
-// some generic internal allocation failure...
-#define ERR_ALLOC 11
-#define ERR_DIM_NOTVALID 12
-#define ERR_SPREAD_THREAD_NOTVALID 13
-
-
// --------------- Private data types for compilation in either prec ---------
// Devnote: must match those in relevant prec of public finufft.h interface!
@@ -63,7 +42,7 @@
// ------------- Library-wide algorithm parameter settings ----------------
// Library version (is a string)
-#define FINUFFT_VER "2.1.0"
+#define FINUFFT_VER "2.2.0.dev0"
// Largest possible kernel spread width per dimension, in fine grid points
// (used only in spreadinterp.cpp)
diff --git a/include/finufft/fftw_defs.h b/include/finufft/fftw_defs.h
index 5d8f5406f..89d86f0de 100644
--- a/include/finufft/fftw_defs.h
+++ b/include/finufft/fftw_defs.h
@@ -45,10 +45,4 @@
#define FFTW_CLEANUP_THREADS()
#endif
-#ifdef FFTW_PLAN_SAFE
- #define FFTW_PLAN_SF() FFTWIFY(make_planner_thread_safe())
-#else
- #define FFTW_PLAN_SF()
-#endif
-
#endif // FFTW_DEFS_H
diff --git a/include/finufft_errors.h b/include/finufft_errors.h
new file mode 100644
index 000000000..74dc8cc21
--- /dev/null
+++ b/include/finufft_errors.h
@@ -0,0 +1,29 @@
+#ifndef FINUFFT_ERRORS_H
+#define FINUFFT_ERRORS_H
+
+// ---------- Global error/warning output codes for the library ---------------
+// NB: if change these numbers, also must regen test/results/dumbinputs.refout
+#define FINUFFT_WARN_EPS_TOO_SMALL 1
+// this means that a fine grid array dim exceeded MAX_NF; no malloc tried...
+#define FINUFFT_ERR_MAXNALLOC 2
+#define FINUFFT_ERR_SPREAD_BOX_SMALL 3
+#define FINUFFT_ERR_SPREAD_PTS_OUT_RANGE 4
+#define FINUFFT_ERR_SPREAD_ALLOC 5
+#define FINUFFT_ERR_SPREAD_DIR 6
+#define FINUFFT_ERR_UPSAMPFAC_TOO_SMALL 7
+#define FINUFFT_ERR_HORNER_WRONG_BETA 8
+#define FINUFFT_ERR_NTRANS_NOTVALID 9
+#define FINUFFT_ERR_TYPE_NOTVALID 10
+// some generic internal allocation failure...
+#define FINUFFT_ERR_ALLOC 11
+#define FINUFFT_ERR_DIM_NOTVALID 12
+#define FINUFFT_ERR_SPREAD_THREAD_NOTVALID 13
+#define FINUFFT_ERR_NDATA_NOTVALID 14
+// cuda malloc/memset/kernel failure/etc
+#define FINUFFT_ERR_CUDA_FAILURE 15
+#define FINUFFT_ERR_PLAN_NOTVALID 16
+#define FINUFFT_ERR_METHOD_NOTVALID 17
+#define FINUFFT_ERR_BINSIZE_NOTVALID 18
+#define FINUFFT_ERR_INSUFFICIENT_SHMEM 19
+
+#endif
diff --git a/make.inc.macosx_arm64 b/make.inc.macosx_arm64
index 88485755c..8889fb964 100644
--- a/make.inc.macosx_arm64
+++ b/make.inc.macosx_arm64
@@ -23,10 +23,10 @@ CXX=clang++
CC=clang
# taken from makefile...
-CFLAGS += -I include -I/usr/local/include -I/opt/homebrew/include
+CFLAGS += -I include -I/usr/local/include -I/opt/homebrew/include -I/opt/homebrew/opt/libomp/include
FFLAGS = $(CFLAGS)
CXXFLAGS = $(CFLAGS)
-LIBS += -L/usr/local/lib -L/opt/homebrew/lib
+LIBS += -L/usr/local/lib -L/opt/homebrew/lib -L/opt/homebrew/opt/libomp/lib
# OpenMP with clang needs following...
OMPFLAGS = -Xpreprocessor -fopenmp
diff --git a/make.inc.macosx_gcc-12 b/make.inc.macosx_gcc-12
new file mode 100644
index 000000000..e274a4d53
--- /dev/null
+++ b/make.inc.macosx_gcc-12
@@ -0,0 +1,49 @@
+# Makefile variable overrides for Mac OSX compilation with GCC v.12
+#
+# Use this if you'll need to link against gfortran.
+#
+# Barnett 10/27/18. Input from Yu-Hsuan Shih, Amit Moskovich.
+# Lu minor modification for gcc-10 12/06/2020
+# Leslie and Dan F found XCode linking problem, workaround 10/5/23.
+
+# By default we use clang/LLVM (which is aliased to /usr/lib/gcc, etc).
+# This make.inc is if you want to override this.
+# Get gcc from brew then use, eg:
+CXX=g++-12
+CC=gcc-12
+FC=gfortran
+
+# as in makefile, but with the brew /usr/local/ stuff...
+CFLAGS += -I src -I/usr/local/include -I/opt/homebrew/include
+FFLAGS = $(CFLAGS)
+CXXFLAGS = $(CFLAGS)
+LIBS += -L/usr/local/lib -L/opt/homebrew/lib
+# Workaround to force old linker to be used due to XCode15 (Issue #360):
+LDFLAGS+=-ld64
+
+# If you're getting warning messages of the form:
+# ld: warning: object file (lib-static/libfinufft.a(finufft1d.o)) was built for
+# newer OSX version (10.13) than being linked (10.9)
+# Then you can uncomment the following two lines with the older version number
+# (in this example -mmacosx-version-min=10.9)
+#
+#CFLAGS += "-mmacosx-version-min="
+
+# OpenMP with GCC on OSX needs following...
+OMPFLAGS = -fopenmp
+OMPLIBS = -L/usr/local/lib -lgomp
+# since fftw3_omp doesn't work in OSX, we need...
+FFTWOMPSUFFIX=threads
+
+# MATLAB interface:
+# some of these will depend on your FFTW library location...
+MFLAGS += -I/usr/local/include -I/opt/homebrew/include -L/usr/local/lib -L/opt/homebrew/lib -lm
+# edit for your MATLAB version location...
+MEX = $(shell ls -d /Applications/MATLAB_R20**.app)/bin/mex
+# Also see docs/install.rst for possible edits to MATLAB's MEX XML file.
+
+# If you have segfault of MATLAB then please try the following:
+#MOMPFLAGS = -D_OPENMP
+#OMPFLAGS = -Xpreprocessor -fopenmp
+#OMPLIBS = $(shell ls -d /Applications/MATLAB_R20**.app)/sys/os/maci64/libiomp5.dylib
+# This links to MATLAB's omp not gomp.
diff --git a/makefile b/makefile
index 6bfbe0f90..96a7a596a 100644
--- a/makefile
+++ b/makefile
@@ -29,8 +29,6 @@ PYTHON = python3
CFLAGS := -O3 -funroll-loops -march=native -fcx-limited-range $(CFLAGS)
FFLAGS := $(CFLAGS) $(FFLAGS)
CXXFLAGS := $(CFLAGS) $(CXXFLAGS)
-# put this in your make.inc if you have FFTW>=3.3.5 and want thread-safe use...
-#CXXFLAGS += -DFFTW_PLAN_SAFE
# FFTW base name, and math linking...
FFTWNAME = fftw3
# linux default is fftw3_omp, since 10% faster than fftw3_threads...
@@ -192,10 +190,6 @@ endif
# examples (C++/C) -----------------------------------------------------------
# build all examples (single-prec codes separate, and not all have one)...
EXAMPLES = $(basename $(wildcard examples/*.c examples/*.cpp))
-# ...except only build threadsafe ones if user switch on (thus FFTW>=3.3.6):
-ifeq (,$(findstring FFTW_PLAN_SAFE,$(CXXFLAGS)))
- EXAMPLES := $(filter-out %/threadsafe1d1 %/threadsafe2d2f, $(EXAMPLES))
-endif
examples: $(EXAMPLES)
ifneq ($(MINGW),ON)
# Windows-MSYS does not find the dynamic libraries, so we make a temporary copy
@@ -442,7 +436,7 @@ objclean:
ifneq ($(MINGW),ON)
# non-Windows-WSL...
rm -f src/*.o test/directft/*.o test/*.o examples/*.o matlab/*.o contrib/*.o
- rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o
+ rm -f fortran/*.o $(FE_DIR)/*.o $(FD)/*.o finufft_mod.mod
else
# Windows-WSL...
for /d %%d in (src,test\directfttest,examples,matlab,contrib) do (for %%f in (%%d\*.o) do (del %%f))
diff --git a/matlab/Contents.m b/matlab/Contents.m
index 0cd218896..3a336d729 100644
--- a/matlab/Contents.m
+++ b/matlab/Contents.m
@@ -1,5 +1,5 @@
% FINUFFT: Flatiron Institute Nonuniform Fast Fourier Transform
-% Version 2.1.0
+% Version 2.2.0.dev0
%
% Basic and many-vector interfaces
% finufft1d1 - 1D complex nonuniform FFT of type 1 (nonuniform to uniform).
diff --git a/matlab/errhandler.m b/matlab/errhandler.m
index f52731868..78410edd1 100644
--- a/matlab/errhandler.m
+++ b/matlab/errhandler.m
@@ -6,7 +6,7 @@ function errhandler(ier)
% Note that there are other matlab-only error types defined in valid_*.m
switch ier
- % These are the ERR_ #defines in ../include/defs.h:
+ % These are the ERR_ #defines in ../include/finufft_errors.h:
case 1
warning('FINUFFT:epsTooSmall','FINUFFT eps tolerance too small to achieve');
case 2
diff --git a/perftest/cuda/cuperftest.cu b/perftest/cuda/cuperftest.cu
index 12750ea4f..b2e6326ed 100644
--- a/perftest/cuda/cuperftest.cu
+++ b/perftest/cuda/cuperftest.cu
@@ -57,6 +57,7 @@ struct test_options_t {
{"N1", required_argument, 0, 0},
{"N2", required_argument, 0, 0},
{"N3", required_argument, 0, 0},
+ {"M", required_argument, 0, 0},
{"tol", required_argument, 0, 0},
{"method", required_argument, 0, 0},
{"kerevalmethod", required_argument, 0, 0},
@@ -87,7 +88,7 @@ struct test_options_t {
N[2] = std::stof(get_or(options_map, "N3", "1"));
M = std::stof(get_or(options_map, "M", "2E6"));
method = std::stoi(get_or(options_map, "method", "1"));
- kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "0"));
+ kerevalmethod = std::stoi(get_or(options_map, "kerevalmethod", "1"));
sort = std::stoi(get_or(options_map, "sort", "1"));
tol = std::stof(get_or(options_map, "tol", "1E-5"));
}
@@ -172,7 +173,7 @@ void run_test(test_options_t &test_opts) {
for (int i = 0; i < 3; ++i)
dim = test_opts.N[i] > 1 ? i + 1 : dim;
- cufinufft_default_opts(test_opts.type, dim, &opts);
+ cufinufft_default_opts(&opts);
opts.gpu_method = test_opts.method;
opts.gpu_sort = test_opts.sort;
opts.gpu_kerevalmeth = test_opts.kerevalmethod;
diff --git a/python/cufinufft/cufinufft/__init__.py b/python/cufinufft/cufinufft/__init__.py
index 730f6a693..31120bb96 100644
--- a/python/cufinufft/cufinufft/__init__.py
+++ b/python/cufinufft/cufinufft/__init__.py
@@ -3,5 +3,9 @@
from cufinufft._simple import (nufft1d1, nufft1d2, nufft2d1, nufft2d2,
nufft3d1, nufft3d2)
-__all__ = ['cufinufft']
-__version__ = '2.2.0dev0'
+__all__ = ["nufft1d1", "nufft1d2",
+ "nufft2d1", "nufft2d2",
+ "nufft3d1", "nufft3d2",
+ "Plan"]
+
+__version__ = '2.2.0.dev0'
diff --git a/python/cufinufft/cufinufft/_compat.py b/python/cufinufft/cufinufft/_compat.py
new file mode 100644
index 000000000..04e066a1a
--- /dev/null
+++ b/python/cufinufft/cufinufft/_compat.py
@@ -0,0 +1,106 @@
+import inspect
+
+import numpy as np
+
+
+def get_array_ptr(data):
+ try:
+ return data.__cuda_array_interface__['data'][0]
+ except RuntimeError:
+ # Handle torch with gradient enabled
+ # https://github.com/flatironinstitute/finufft/pull/326#issuecomment-1652212770
+ return data.data_ptr()
+ except AttributeError:
+ raise TypeError("Invalid GPU array implementation. Implementation must implement the standard cuda array interface.")
+
+
+def get_array_module(obj):
+ module_name = inspect.getmodule(type(obj)).__name__
+
+ if module_name.startswith("numba.cuda"):
+ return "numba"
+ elif module_name.startswith("torch"):
+ return "torch"
+ elif module_name.startswith("pycuda"):
+ return "pycuda"
+ else:
+ return "generic"
+
+
+def get_array_size(obj):
+ array_module = get_array_module(obj)
+
+ if array_module == "torch":
+ return len(obj)
+ else:
+ return obj.size
+
+
+def get_array_dtype(obj):
+ array_module = get_array_module(obj)
+
+ if array_module == "torch":
+ dtype_str = str(obj.dtype)
+ dtype_str = dtype_str[len("torch."):]
+ return np.dtype(dtype_str)
+ else:
+ return obj.dtype
+
+
+def is_array_contiguous(obj):
+ array_module = get_array_module(obj)
+
+ if array_module == "numba":
+ return obj.is_c_contiguous()
+ elif array_module == "torch":
+ return obj.is_contiguous()
+ else:
+ return obj.flags.c_contiguous
+
+
+def array_can_contiguous(obj):
+ array_module = get_array_module(obj)
+
+ if array_module == "pycuda":
+ return False
+ else:
+ return True
+
+
+def array_contiguous(obj):
+ array_module = get_array_module(obj)
+
+ if array_module == "numba":
+ import numba
+ ret = numba.cuda.device_array(obj.shape, obj.dtype, stream=obj.stream)
+ ret[:] = obj[:]
+ return ret
+ if array_module == "torch":
+ return obj.contiguous()
+ else:
+ return obj.copy(order="C")
+
+
+def array_empty_like(obj, *args, **kwargs):
+ module_name = get_array_module(obj)
+
+ if module_name == "numba":
+ import numba.cuda
+ return numba.cuda.device_array(*args, **kwargs)
+ elif module_name == "torch":
+ import torch
+ if "shape" in kwargs:
+ kwargs["size"] = kwargs.pop("shape")
+ if "dtype" in kwargs:
+ dtype = kwargs.pop("dtype")
+ if dtype == np.complex64:
+ dtype = torch.complex64
+ elif dtype == np.complex128:
+ dtype = torch.complex128
+ kwargs["dtype"] = dtype
+ if "device" not in kwargs:
+ kwargs["device"] = obj.device
+
+ return torch.empty(*args, **kwargs)
+ else:
+ return type(obj)(*args, **kwargs)
diff --git a/python/cufinufft/cufinufft/_cufinufft.py b/python/cufinufft/cufinufft/_cufinufft.py
index 80e83643b..3e0128ce1 100644
--- a/python/cufinufft/cufinufft/_cufinufft.py
+++ b/python/cufinufft/cufinufft/_cufinufft.py
@@ -17,8 +17,6 @@
warnings.filterwarnings("ignore", category=DeprecationWarning)
import imp
-import numpy as np
-
from ctypes import c_double
from ctypes import c_int
from ctypes import c_int64
@@ -93,8 +91,8 @@ class NufftOpts(ctypes.Structure):
NufftOpts_p = ctypes.POINTER(NufftOpts)
_default_opts = lib.cufinufft_default_opts
-_default_opts.argtypes = [c_int, c_int, NufftOpts_p]
-_default_opts.restype = c_int
+_default_opts.argtypes = [NufftOpts_p]
+_default_opts.restype = None
_make_plan = lib.cufinufft_makeplan
_make_plan.argtypes = [
diff --git a/python/cufinufft/cufinufft/_plan.py b/python/cufinufft/cufinufft/_plan.py
index 084619990..a165cbe20 100644
--- a/python/cufinufft/cufinufft/_plan.py
+++ b/python/cufinufft/cufinufft/_plan.py
@@ -25,7 +25,7 @@
from cufinufft._cufinufft import _destroy_plan
from cufinufft._cufinufft import _destroy_planf
-from pycuda.gpuarray import GPUArray
+from cufinufft import _compat
# If we are shutting down python, we don't need to run __del__
@@ -115,7 +115,7 @@ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None,
self._maxbatch = 1 # TODO: optimize this one day
# Get the default option values.
- self._opts = self._default_opts(nufft_type, self.dim)
+ self._opts = self._default_opts()
# Extract list of valid field names.
field_names = [name for name, _ in self._opts._fields_]
@@ -135,7 +135,7 @@ def __init__(self, nufft_type, n_modes, n_trans=1, eps=1e-6, isign=None,
self._references = []
@staticmethod
- def _default_opts(nufft_type, dim):
+ def _default_opts():
"""
Generates a cufinufft opt struct of the dtype coresponding to plan.
@@ -147,10 +147,7 @@ def _default_opts(nufft_type, dim):
nufft_opts = NufftOpts()
- ier = _default_opts(nufft_type, dim, nufft_opts)
-
- if ier != 0:
- raise RuntimeError('Configuration not yet implemented.')
+ _default_opts(nufft_opts)
return nufft_opts
@@ -206,7 +203,7 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
_x, _y, _z = _ensure_valid_pts(_x, _y, _z, self.dim)
- M = _x.size
+ M = _compat.get_array_size(_x)
# Because FINUFFT/cufinufft are internally column major,
# we will reorder the pts axes. Reordering references
@@ -217,17 +214,17 @@ def setpts(self, x, y=None, z=None, s=None, t=None, u=None):
# (x, y, None) ~> (y, x, None)
# (x, y, z) ~> (z, y, x)
# Via code, we push each dimension onto a stack of axis
- fpts_axes = [_x.ptr, None, None]
+ fpts_axes = [_compat.get_array_ptr(_x), None, None]
# We will also store references to these arrays.
# This keeps python from prematurely cleaning them up.
self._references.append(_x)
if self.dim >= 2:
- fpts_axes.insert(0, _y.ptr)
+ fpts_axes.insert(0, _compat.get_array_ptr(_y))
self._references.append(_y)
if self.dim >= 3:
- fpts_axes.insert(0, _z.ptr)
+ fpts_axes.insert(0, _compat.get_array_ptr(_z))
self._references.append(_z)
# Then take three items off the stack as our reordered axis.
@@ -278,14 +275,16 @@ def execute(self, data, out=None):
req_out_shape = batch_shape + req_out_shape
if out is None:
- _out = GPUArray(req_out_shape, dtype=self.dtype)
+ _out = _compat.array_empty_like(_data, req_out_shape, dtype=self.dtype)
else:
_out = _ensure_array_shape(_out, "out", req_out_shape)
if self.type == 1:
- ier = self._exec_plan(self._plan, data.ptr, _out.ptr)
+ ier = self._exec_plan(self._plan, _compat.get_array_ptr(_data),
+ _compat.get_array_ptr(_out))
elif self.type == 2:
- ier = self._exec_plan(self._plan, _out.ptr, data.ptr)
+ ier = self._exec_plan(self._plan, _compat.get_array_ptr(_out),
+ _compat.get_array_ptr(_data))
if ier != 0:
raise RuntimeError('Error executing plan.')
@@ -315,27 +314,21 @@ def __del__(self):
def _ensure_array_type(x, name, dtype, output=False):
if x is None:
- return GPUArray(0, dtype=dtype, order="C")
+ return None
- if x.dtype != dtype:
+ if _compat.get_array_dtype(x) != dtype:
raise TypeError(f"Argument `{name}` does not have the correct dtype: "
f"{x.dtype} was given, but {dtype} was expected.")
- if not x.flags.c_contiguous:
- if output:
+ if not _compat.is_array_contiguous(x):
+ if output or not _compat.array_can_contiguous(x):
raise TypeError(f"Argument `{name}` does not satisfy the "
f"following requirement: C")
else:
- raise TypeError(f"Argument `{name}` does not satisfy the "
- f"following requirement: C")
-
- # Ideally we'd copy the array into the correct ordering here, but
- # this does not seem possible as of pycuda 2022.2.2.
-
- # warnings.warn(f"Argument `{name}` does not satisfy the "
- # f"following requirement: C. Copying array (this may
- # reduce performance)")
- # x = gpuarray.GPUArray(x, dtype=dtype, order="C")
+ warnings.warn(f"Argument `{name}` does not satisfy the "
+ f"following requirement: C. Copying array "
+ f"(this may reduce performance)")
+ x = _compat.array_contiguous(x)
return x
@@ -354,22 +347,21 @@ def _ensure_array_shape(x, name, shape, allow_reshape=False):
else:
return x
+
def _ensure_valid_pts(x, y, z, dim):
if x.ndim != 1:
raise TypeError(f"Argument `x` must be a vector")
- M = x.size
-
if dim >= 2:
y = _ensure_array_shape(y, "y", x.shape)
if dim >= 3:
z = _ensure_array_shape(z, "z", x.shape)
- if dim < 3 and z.size > 0:
+ if dim < 3 and z is not None and _compat.get_array_size(z) > 0:
raise TypeError(f"Plan dimension is {dim}, but `z` was specified")
- if dim < 2 and y.size > 0:
+ if dim < 2 and y is not None and _compat.get_array_size(y) > 0:
raise TypeError(f"Plan dimension is {dim}, but `y` was specified")
return x, y, z
diff --git a/python/cufinufft/cufinufft/_simple.py b/python/cufinufft/cufinufft/_simple.py
index 587868a0d..ac36e90ab 100644
--- a/python/cufinufft/cufinufft/_simple.py
+++ b/python/cufinufft/cufinufft/_simple.py
@@ -1,25 +1,30 @@
-from cufinufft import Plan
+from cufinufft import Plan, _compat
-def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1):
- return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes)
+def nufft1d1(x, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
+ return _invoke_plan(1, 1, x, None, None, data, out, isign, eps, n_modes,
+ kwargs)
-def nufft1d2(x, data, out=None, eps=1e-6, isign=-1):
- return _invoke_plan(1, 2, x, None, None, data, out, isign, eps)
+def nufft1d2(x, data, out=None, eps=1e-6, isign=-1, **kwargs):
+ return _invoke_plan(1, 2, x, None, None, data, out, isign, eps, None,
+ kwargs)
-def nufft2d1(x, y, data, n_modes=None, out=None, eps=1e-6, isign=1):
- return _invoke_plan(2, 1, x, y, None, data, out, isign, eps, n_modes)
+def nufft2d1(x, y, data, n_modes=None, out=None, eps=1e-6, isign=1, **kwargs):
+ return _invoke_plan(2, 1, x, y, None, data, out, isign, eps, n_modes,
+ kwargs)
-def nufft2d2(x, y, data, out=None, eps=1e-6, isign=-1):
- return _invoke_plan(2, 2, x, y, None, data, out, isign, eps)
+def nufft2d2(x, y, data, out=None, eps=1e-6, isign=-1, **kwargs):
+ return _invoke_plan(2, 2, x, y, None, data, out, isign, eps, None, kwargs)
-def nufft3d1(x, y, z, data, n_modes=None, out=None, eps=1e-6, isign=1):
- return _invoke_plan(3, 1, x, y, z, data, out, isign, eps, n_modes)
+def nufft3d1(x, y, z, data, n_modes=None, out=None, eps=1e-6, isign=1,
+ **kwargs):
+ return _invoke_plan(3, 1, x, y, z, data, out, isign, eps, n_modes, kwargs)
-def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1):
- return _invoke_plan(3, 2, x, y, z, data, out, isign, eps)
+def nufft3d2(x, y, z, data, out=None, eps=1e-6, isign=-1, **kwargs):
+ return _invoke_plan(3, 2, x, y, z, data, out, isign, eps, None, kwargs)
-def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps, n_modes=None):
- dtype = data.dtype
+def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps,
+ n_modes=None, kwargs=None):
+ dtype = _compat.get_array_dtype(data)
n_trans = _get_ntrans(dim, nufft_type, data)
@@ -28,7 +33,7 @@ def _invoke_plan(dim, nufft_type, x, y, z, data, out, isign, eps, n_modes=None):
if nufft_type == 2:
n_modes = data.shape[-dim:]
- plan = Plan(nufft_type, n_modes, n_trans, eps, isign, dtype)
+ plan = Plan(nufft_type, n_modes, n_trans, eps, isign, dtype, **kwargs)
plan.setpts(x, y, z)
diff --git a/python/cufinufft/examples/example2d1many.py b/python/cufinufft/examples/example2d1_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/example2d1many.py
rename to python/cufinufft/examples/example2d1_pycuda.py
diff --git a/python/cufinufft/examples/example2d2many.py b/python/cufinufft/examples/example2d2_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/example2d2many.py
rename to python/cufinufft/examples/example2d2_pycuda.py
diff --git a/python/cufinufft/examples/getting_started.py b/python/cufinufft/examples/getting_started_pycuda.py
similarity index 100%
rename from python/cufinufft/examples/getting_started.py
rename to python/cufinufft/examples/getting_started_pycuda.py
diff --git a/python/cufinufft/requirements.txt b/python/cufinufft/requirements.txt
index fcbec6659..bc2cbbd1c 100644
--- a/python/cufinufft/requirements.txt
+++ b/python/cufinufft/requirements.txt
@@ -1,3 +1,2 @@
numpy
-pycuda
six
diff --git a/python/cufinufft/setup.py b/python/cufinufft/setup.py
index b1d63cd29..129e5d864 100644
--- a/python/cufinufft/setup.py
+++ b/python/cufinufft/setup.py
@@ -37,7 +37,7 @@
# Python Package Setup
setup(
name='cufinufft',
- version='2.2.0dev0',
+ version='2.2.0.dev0',
author='Yu-shuan Melody Shih, Garrett Wright, Joakim Anden, Johannes Blaschke, Alex Barnett',
author_email='janden-vscholar@flatironinstitute.org',
url='https://github.com/flatironinstitute/cufinufft',
diff --git a/python/cufinufft/tests/conftest.py b/python/cufinufft/tests/conftest.py
new file mode 100644
index 000000000..56528681f
--- /dev/null
+++ b/python/cufinufft/tests/conftest.py
@@ -0,0 +1,24 @@
+import pytest
+
+import utils
+
+
+def pytest_addoption(parser):
+ parser.addoption("--framework", action="append", default=[], help="List of frameworks")
+
+def pytest_generate_tests(metafunc):
+ if "framework" in metafunc.fixturenames:
+ metafunc.parametrize("framework", metafunc.config.getoption("framework"))
+
+@pytest.fixture
+def to_gpu(framework):
+ to_gpu, _ = utils.transfer_funcs(framework)
+
+ return to_gpu
+
+
+@pytest.fixture
+def to_cpu(framework):
+ _, to_cpu = utils.transfer_funcs(framework)
+
+ return to_cpu
diff --git a/python/cufinufft/tests/test_array_ordering.py b/python/cufinufft/tests/test_array_ordering.py
index 61d704371..0fba8f8f5 100644
--- a/python/cufinufft/tests/test_array_ordering.py
+++ b/python/cufinufft/tests/test_array_ordering.py
@@ -2,39 +2,26 @@
import numpy as np
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
import utils
-def test_type2_ordering(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
- complex_dtype = utils._complex_dtype(dtype)
-
- k = utils.gen_nu_pts(M).astype(dtype)
- fk = utils.gen_uniform_data(shape).astype(complex_dtype)
- fkTT = fk.T.copy().T
-
- k_gpu = gpuarray.to_gpu(k)
- fk_gpu = gpuarray.to_gpu(fk)
- fkTT_gpu = gpuarray.to_gpu(fkTT)
+def test_type1_ordering(to_gpu, to_cpu, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+ complex_dtype = utils._complex_dtype(dtype)
- plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
+ k, c = utils.type1_problem(dtype, shape, M)
- plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2])
+ k_gpu = to_gpu(k)
+ c_gpu = to_gpu(c)
- c_gpu = plan.execute(fk_gpu)
+ plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
- with pytest.raises(TypeError) as err:
- cTT_gpu = plan.execute(fkTT_gpu)
- assert "following requirement: C" in err.value.args[0]
+ plan.setpts(*k_gpu)
- # Ideally, it should be possible to get this to align with true output,
- # but corrently does not look like it.
+ out = np.empty(shape, dtype=complex_dtype, order="F")
- # c = c_gpu.get()
- # cTT = cTT_gpu.get()
+ out_gpu = to_gpu(out)
- # assert np.allclose(c, cTT, rtol=1e-2)
+ with pytest.raises(TypeError, match="following requirement: C") as err:
+ plan.execute(c_gpu, out=out_gpu)
diff --git a/python/cufinufft/tests/test_basic.py b/python/cufinufft/tests/test_basic.py
index aa12b19ab..f4bc5713a 100644
--- a/python/cufinufft/tests/test_basic.py
+++ b/python/cufinufft/tests/test_basic.py
@@ -1,118 +1,112 @@
-import numpy as np
+import pytest
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
+import numpy as np
-from cufinufft import Plan
+from cufinufft import Plan, _compat
import utils
+# NOTE: Tests below fail for tolerance 1e-4 (error executing plan).
+
+DTYPES = [np.float32, np.float64]
+SHAPES = [(16,), (16, 16), (16, 16, 16)]
+MS = [256, 1024, 4096]
+TOLS = [1e-3, 1e-6]
+OUTPUT_ARGS = [False, True]
+CONTIGUOUS = [False, True]
-def _test_type1(dtype, shape=(16, 16, 16), M=4096, tol=1e-3, output_arg=True):
- complex_dtype = utils._complex_dtype(dtype)
- dim = len(shape)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("M", MS)
+@pytest.mark.parametrize("tol", TOLS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_type1(to_gpu, to_cpu, dtype, shape, M, tol, output_arg):
+ complex_dtype = utils._complex_dtype(dtype)
- k = utils.gen_nu_pts(M, dim=dim).astype(dtype)
- c = utils.gen_nonuniform_data(M).astype(complex_dtype)
+ k, c = utils.type1_problem(dtype, shape, M)
- k_gpu = gpuarray.to_gpu(k)
- c_gpu = gpuarray.to_gpu(c)
+ k_gpu = to_gpu(k)
+ c_gpu = to_gpu(c)
plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
- plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2])
+ # Since k_gpu is an array of shape (dim, M), this will expand to
+    # plan.setpts(k_gpu[0], ..., k_gpu[dim-1]), allowing us to handle all
+ # dimensions with the same call.
+ plan.setpts(*k_gpu)
if output_arg:
- fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+ fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype)
plan.execute(c_gpu, out=fk_gpu)
else:
fk_gpu = plan.execute(c_gpu)
- fk = fk_gpu.get()
-
- ind = int(0.1789 * np.prod(shape))
-
- fk_est = fk.ravel()[ind]
- fk_target = utils.direct_type1(c, k, shape, ind)
+ fk = to_cpu(fk_gpu)
- type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)
+ utils.verify_type1(k, c, fk, tol)
- print('Type 1 relative error:', type1_rel_err)
- assert type1_rel_err < 0.01
-
-
-def test_type1_32(shape=(16, 16, 16), M=4096, tol=1e-3):
- _test_type1(dtype=np.float32, shape=shape, M=M, tol=tol,
- output_arg=True)
- _test_type1(dtype=np.float32, shape=shape, M=M, tol=tol,
- output_arg=False)
-
-
-def test_type1_64(shape=(16, 16, 16), M=4096, tol=1e-3):
- _test_type1(dtype=np.float64, shape=shape, M=M, tol=tol,
- output_arg=True)
- _test_type1(dtype=np.float64, shape=shape, M=M, tol=tol,
- output_arg=False)
-
-
-def _test_type2(dtype, shape=(16, 16, 16), M=4096, tol=1e-3, output_arg=True):
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("M", MS)
+@pytest.mark.parametrize("tol", TOLS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+@pytest.mark.parametrize("contiguous", CONTIGUOUS)
+def test_type2(to_gpu, to_cpu, dtype, shape, M, tol, output_arg, contiguous):
complex_dtype = utils._complex_dtype(dtype)
- k = utils.gen_nu_pts(M).astype(dtype)
- fk = utils.gen_uniform_data(shape).astype(complex_dtype)
-
- k_gpu = gpuarray.to_gpu(k)
- fk_gpu = gpuarray.to_gpu(fk)
+ k, fk = utils.type2_problem(dtype, shape, M)
plan = Plan(2, shape, eps=tol, dtype=complex_dtype)
- plan.setpts(k_gpu[0], k_gpu[1], k_gpu[2])
+ check_result = True
- if output_arg:
- c_gpu = gpuarray.GPUArray(shape=(M,), dtype=complex_dtype)
- plan.execute(fk_gpu, out=c_gpu)
- else:
- c_gpu = plan.execute(fk_gpu)
-
- c = c_gpu.get()
-
- ind = M // 2
+ if not contiguous and len(shape) > 1:
+ fk = fk.copy(order="F")
- c_est = c[ind]
- c_target = utils.direct_type2(fk, k[:, ind])
+ if _compat.array_can_contiguous(to_gpu(np.empty(1))):
+ def _execute(*args, **kwargs):
+ with pytest.warns(UserWarning, match="requirement: C. Copying"):
+ return plan.execute(*args, **kwargs)
+ else:
+ check_result = False
- type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target)
+ def _execute(*args, **kwargs):
+ with pytest.raises(TypeError, match="requirement: C"):
+ plan.execute(*args, **kwargs)
- print('Type 2 relative error:', type2_rel_err)
+ else:
+ def _execute(*args, **kwargs):
+ return plan.execute(*args, **kwargs)
- assert type2_rel_err < 0.01
+ k_gpu = to_gpu(k)
+ fk_gpu = to_gpu(fk)
+ plan.setpts(*k_gpu)
-def test_type2_32(shape=(16, 16, 16), M=4096, tol=1e-3):
- _test_type2(dtype=np.float32, shape=shape, M=M, tol=tol, output_arg=True)
- _test_type2(dtype=np.float32, shape=shape, M=M, tol=tol, output_arg=False)
+ if output_arg:
+ c_gpu = _compat.array_empty_like(fk_gpu, (M,), dtype=complex_dtype)
+ _execute(fk_gpu, out=c_gpu)
+ else:
+ c_gpu = _execute(fk_gpu)
+ if check_result:
+ c = to_cpu(c_gpu)
-def test_type2_64(shape=(16, 16, 16), M=4096, tol=1e-3):
- _test_type2(dtype=np.float64, shape=shape, M=M, tol=tol, output_arg=True)
- _test_type2(dtype=np.float64, shape=shape, M=M, tol=tol, output_arg=False)
+ utils.verify_type2(k, fk, c, tol)
-def test_opts(shape=(8, 8, 8), M=32, tol=1e-3):
+def test_opts(to_gpu, to_cpu, shape=(8, 8, 8), M=32, tol=1e-3):
dtype = np.float32
complex_dtype = utils._complex_dtype(dtype)
- dim = len(shape)
-
- k = utils.gen_nu_pts(M, dim=dim).astype(dtype)
- c = utils.gen_nonuniform_data(M).astype(complex_dtype)
+ k, c = utils.type1_problem(dtype, shape, M)
- k_gpu = gpuarray.to_gpu(k)
- c_gpu = gpuarray.to_gpu(c)
- fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+ k_gpu = to_gpu(k)
+ c_gpu = to_gpu(c)
+ fk_gpu = _compat.array_empty_like(c_gpu, shape, dtype=complex_dtype)
plan = Plan(1, shape, eps=tol, dtype=complex_dtype, gpu_sort=False,
gpu_maxsubprobsize=10)
@@ -121,24 +115,6 @@ def test_opts(shape=(8, 8, 8), M=32, tol=1e-3):
plan.execute(c_gpu, fk_gpu)
- fk = fk_gpu.get()
-
- ind = int(0.1789 * np.prod(shape))
-
- fk_est = fk.ravel()[ind]
- fk_target = utils.direct_type1(c, k, shape, ind)
-
- type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)
-
- assert type1_rel_err < 0.01
-
-
-def main():
- test_type1_32()
- test_type2_32()
- test_type1_64()
- test_type2_64()
-
+ fk = to_cpu(fk_gpu)
-if __name__ == '__main__':
- main()
+ utils.verify_type1(k, c, fk, tol)
diff --git a/python/cufinufft/tests/test_error_checks.py b/python/cufinufft/tests/test_error_checks.py
index ca4b42415..6a9a6b4aa 100644
--- a/python/cufinufft/tests/test_error_checks.py
+++ b/python/cufinufft/tests/test_error_checks.py
@@ -1,15 +1,11 @@
import numpy as np
import pytest
-import pycuda.autoinit # NOQA:401
-import pycuda.gpuarray as gpuarray
-
-from cufinufft import Plan
+from cufinufft import Plan, _compat
import utils
-
-def test_set_nu_raises_on_dtype():
+def test_set_nu_raises_on_dtype(to_gpu):
dtype = np.complex64
M = 4096
@@ -19,10 +15,10 @@ def test_set_nu_raises_on_dtype():
kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
- kxyz_gpu = gpuarray.to_gpu(kxyz)
+ kxyz_gpu = to_gpu(kxyz)
# Here we'll intentionally contruct an incorrect array dtype.
- kxyz_gpu_wrong_type = gpuarray.to_gpu(kxyz.astype(np.float64))
+ kxyz_gpu_wrong_type = to_gpu(kxyz.real.astype(np.float64))
plan = Plan(1, shape, eps=tol, dtype=dtype)
@@ -40,7 +36,7 @@ def test_set_nu_raises_on_dtype():
kxyz_gpu_wrong_type[1], kxyz_gpu_wrong_type[2])
-def test_set_pts_raises_on_size():
+def test_set_pts_raises_on_size(to_gpu):
dtype = np.float32
complex_dtype = np.complex64
@@ -51,26 +47,68 @@ def test_set_pts_raises_on_size():
kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
- kxyz_gpu = gpuarray.to_gpu(kxyz)
+ kxyz_gpu = to_gpu(kxyz)
plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
- with pytest.raises(TypeError) as err:
+ with pytest.raises(TypeError, match="`y` must be of shape") as err:
plan.setpts(kxyz_gpu[0], kxyz_gpu[1][:4])
- assert '`y` must be of shape' in err.value.args[0]
- with pytest.raises(TypeError) as err:
+ with pytest.raises(TypeError, match="`z` must be of shape") as err:
plan.setpts(kxyz_gpu[0], kxyz_gpu[1], kxyz_gpu[2][:4])
- assert '`z` must be of shape' in err.value.args[0]
+
+
+def test_set_pts_raises_on_nonvector(to_gpu):
+ dtype = np.float32
+ complex_dtype = np.complex64
+
+ M = 8
+ tol = 1e-3
+ shape = (16, 16, 16)
+ dim = len(shape)
+
+ kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
+
+ kxyz_gpu = to_gpu(kxyz)
+
+ plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
+
+ with pytest.raises(TypeError, match="`x` must be a vector") as err:
+ plan.setpts(kxyz)
+
+
+def test_set_pts_raises_on_number_of_args(to_gpu):
+ dtype = np.float32
+ complex_dtype = np.complex64
+
+ M = 8
+ tol = 1e-3
+ shape = (16,)
+ dim = len(shape)
+
+ kxyz = utils.gen_nu_pts(M, dim=3).astype(dtype)
+
+ kxyz_gpu = to_gpu(kxyz)
+
+ plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
+
+ with pytest.raises(TypeError, match="is 1, but `y` was specified") as err:
+ plan.setpts(*kxyz_gpu[:2])
+
+ shape = (16, 16)
+
+ plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
+
+ with pytest.raises(TypeError, match="is 2, but `z` was specified") as err:
+ plan.setpts(*kxyz_gpu)
def test_wrong_field_names():
- with pytest.raises(TypeError) as err:
+ with pytest.raises(TypeError, match="Invalid option 'foo'") as err:
plan = Plan(1, (8, 8), foo="bar")
- assert "Invalid option 'foo'" in err.value.args[0]
-def test_exec_raises_on_dtype():
+def test_exec_raises_on_dtype(to_gpu):
dtype = np.float32
complex_dtype = np.complex64
@@ -81,14 +119,17 @@ def test_exec_raises_on_dtype():
kxyz = utils.gen_nu_pts(M, dim=dim).astype(dtype)
c = utils.gen_nonuniform_data(M).astype(complex_dtype)
- c_gpu = gpuarray.to_gpu(c)
+ c_gpu = to_gpu(c)
# Using c.real gives us wrong dtype here...
- c_gpu_wrong_dtype = gpuarray.to_gpu(c.real)
+ # Need contiguous here since numba does not allow transfers of
+ # non-contiguous arrays.
+ c_gpu_wrong_dtype = to_gpu(np.ascontiguousarray(c.real))
- kxyz_gpu = gpuarray.to_gpu(kxyz)
- fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
+ kxyz_gpu = to_gpu(kxyz)
+ fk_gpu = _compat.array_empty_like(kxyz_gpu, shape, dtype=complex_dtype)
# Here we'll intentionally contruct an incorrect array dtype.
- fk_gpu_wrong_dtype = gpuarray.GPUArray(shape, dtype=np.complex128)
+ fk_gpu_wrong_dtype = _compat.array_empty_like(fk_gpu, shape,
+ dtype=np.complex128)
plan = Plan(1, shape, eps=tol, dtype=complex_dtype)
@@ -102,11 +143,14 @@ def test_exec_raises_on_dtype():
plan.execute(c_gpu_wrong_dtype, fk_gpu)
+def test_dtype_errors():
+ with pytest.raises(TypeError, match="Expected complex64 or complex128") as err:
+ Plan(1, (8, 8), dtype="uint8")
+
+
def test_dtype_warnings():
- with pytest.warns(DeprecationWarning) as record:
+ with pytest.warns(DeprecationWarning, match="Converting to complex64") as record:
Plan(1, (8, 8), dtype="float32")
- assert "Converting to complex64" in record[0].message.args[0]
- with pytest.warns(DeprecationWarning) as record:
+ with pytest.warns(DeprecationWarning, match="Converting to complex128") as record:
Plan(1, (8, 8), dtype="float64")
- assert "Converting to complex128" in record[0].message.args[0]
diff --git a/python/cufinufft/tests/test_examples.py b/python/cufinufft/tests/test_examples.py
index 34fe610a8..c6fb5dd45 100644
--- a/python/cufinufft/tests/test_examples.py
+++ b/python/cufinufft/tests/test_examples.py
@@ -17,5 +17,11 @@
scripts.append(os.path.join(examples_dir, filename))
@pytest.mark.parametrize("filename", scripts)
-def test_example(filename):
- subprocess.check_call([sys.executable, filename])
+def test_example(filename, request):
+ # Extract framework from format `example_framework.py`.
+ framework = Path(filename).stem.split("_")[-1]
+
+ if framework in request.config.getoption("framework"):
+ subprocess.check_call([sys.executable, filename])
+ else:
+ pytest.skip("Example not in list of frameworks")
diff --git a/python/cufinufft/tests/test_multi.py b/python/cufinufft/tests/test_multi.py
index c05810c4d..a8e392fed 100644
--- a/python/cufinufft/tests/test_multi.py
+++ b/python/cufinufft/tests/test_multi.py
@@ -1,16 +1,18 @@
import pytest
import numpy as np
-
-import pycuda.driver as drv
-import pycuda.gpuarray as gpuarray
-
from cufinufft import Plan
import utils
-def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+def test_multi_type1(framework, dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
+ if framework == "pycuda":
+ import pycuda.driver as drv
+ import pycuda.gpuarray as gpuarray
+ else:
+ pytest.skip("Multi-GPU support only tested for pycuda")
+
complex_dtype = utils._complex_dtype(dtype)
drv.init()
@@ -59,11 +61,3 @@ def test_multi_type1(dtype=np.float32, shape=(16, 16, 16), M=4096, tol=1e-3):
errs.append(type1_rel_err)
assert all(err < 0.01 for err in errs)
-
-
-def main():
- test_multi_type1()
-
-
-if __name__ == '__main__':
- main()
diff --git a/python/cufinufft/tests/test_simple.py b/python/cufinufft/tests/test_simple.py
index 906d85c27..70b602143 100644
--- a/python/cufinufft/tests/test_simple.py
+++ b/python/cufinufft/tests/test_simple.py
@@ -2,86 +2,85 @@
import numpy as np
-import pycuda.autoinit
-import pycuda.gpuarray as gpuarray
-
import cufinufft
+from cufinufft import _compat
import utils
DTYPES = [np.float32, np.float64]
-SHAPES = [(64,), (64, 64), (64, 64, 64)]
+SHAPES = [(16,), (16, 16), (16, 16, 16)]
+N_TRANS = [(), (1,), (2,)]
MS = [256, 1024, 4096]
-TOLS = [1e-2, 1e-3]
+TOLS = [1e-3, 1e-6]
OUTPUT_ARGS = [False, True]
@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("n_trans", N_TRANS)
+@pytest.mark.parametrize("shape", SHAPES)
@pytest.mark.parametrize("M", MS)
@pytest.mark.parametrize("tol", TOLS)
@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_nufft3d1(dtype, M, tol, output_arg):
- shape = (64, 64, 64)
-
+def test_simple_type1(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
real_dtype = dtype
complex_dtype = utils._complex_dtype(dtype)
dim = len(shape)
- k = utils.gen_nu_pts(M, dim=dim).astype(real_dtype)
- c = utils.gen_nonuniform_data(M).astype(complex_dtype)
+ # Select which function to call based on dimension.
+ fun = {1: cufinufft.nufft1d1,
+ 2: cufinufft.nufft2d1,
+ 3: cufinufft.nufft3d1}[dim]
- k_gpu = gpuarray.to_gpu(k)
- c_gpu = gpuarray.to_gpu(c)
+ k, c = utils.type1_problem(dtype, shape, M, n_trans=n_trans)
- if output_arg:
- fk_gpu = gpuarray.GPUArray(shape, dtype=complex_dtype)
- cufinufft.nufft3d1(*k_gpu, c_gpu, out=fk_gpu, eps=tol)
- else:
- fk_gpu = cufinufft.nufft3d1(*k_gpu, c_gpu, shape, eps=tol)
+ k_gpu = to_gpu(k)
+ c_gpu = to_gpu(c)
- fk = fk_gpu.get()
-
- ind = int(0.1789 * np.prod(shape))
+ if output_arg:
+ # Ensure that output array has proper shape i.e., (N1, ...) for no
+ # batch, (1, N1, ...) for batch of size one, and (n, N1, ...) for
+ # batch of size n.
+ fk_gpu = _compat.array_empty_like(c_gpu, n_trans + shape,
+ dtype=complex_dtype)
- fk_est = fk.ravel()[ind]
- fk_target = utils.direct_type1(c, k, shape, ind)
+ fun(*k_gpu, c_gpu, out=fk_gpu, eps=tol)
+ else:
+ fk_gpu = fun(*k_gpu, c_gpu, shape, eps=tol)
- type1_rel_err = np.abs(fk_target - fk_est) / np.abs(fk_target)
+ fk = to_cpu(fk_gpu)
- assert type1_rel_err < 10 * tol
+ utils.verify_type1(k, c, fk, tol)
@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("n_trans", N_TRANS)
@pytest.mark.parametrize("M", MS)
@pytest.mark.parametrize("tol", TOLS)
@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
-def test_nufft3d2(dtype, M, tol, output_arg):
- shape = (64, 64, 64)
-
+def test_simple_type2(to_gpu, to_cpu, dtype, shape, n_trans, M, tol, output_arg):
real_dtype = dtype
complex_dtype = utils._complex_dtype(dtype)
dim = len(shape)
- k = utils.gen_nu_pts(M, dim=dim).astype(real_dtype)
- fk = utils.gen_uniform_data(shape).astype(complex_dtype)
-
- k_gpu = gpuarray.to_gpu(k)
- fk_gpu = gpuarray.to_gpu(fk)
+ fun = {1: cufinufft.nufft1d2,
+ 2: cufinufft.nufft2d2,
+ 3: cufinufft.nufft3d2}[dim]
- if output_arg:
- c_gpu = gpuarray.GPUArray((M,), dtype=complex_dtype)
- cufinufft.nufft3d2(*k_gpu, fk_gpu, out=c_gpu)
- else:
- c_gpu = cufinufft.nufft3d2(*k_gpu, fk_gpu)
+ k, fk = utils.type2_problem(dtype, shape, M, n_trans=n_trans)
- c = c_gpu.get()
+ k_gpu = to_gpu(k)
+ fk_gpu = to_gpu(fk)
- ind = M // 2
+ if output_arg:
+ c_gpu = _compat.array_empty_like(fk_gpu, n_trans + (M,),
+ dtype=complex_dtype)
- c_est = c[ind]
- c_target = utils.direct_type2(fk, k[:, ind])
+ fun(*k_gpu, fk_gpu, eps=tol, out=c_gpu)
+ else:
+ c_gpu = fun(*k_gpu, fk_gpu, eps=tol)
- type2_rel_err = np.abs(c_target - c_est) / np.abs(c_target)
+ c = to_cpu(c_gpu)
- assert type2_rel_err < 10 * tol
+ utils.verify_type2(k, fk, c, tol)
diff --git a/python/cufinufft/tests/utils.py b/python/cufinufft/tests/utils.py
index 9f7e70a63..9ea3281f3 100644
--- a/python/cufinufft/tests/utils.py
+++ b/python/cufinufft/tests/utils.py
@@ -37,35 +37,121 @@ def gen_uniform_data(shape, seed=0):
return fk
-def gen_nonuniform_data(M, seed=0):
+def gen_nonuniform_data(M, seed=0, n_trans=()):
np.random.seed(seed)
- c = np.random.standard_normal(2 * M)
+ c = np.random.standard_normal(2 * M * int(np.prod(n_trans)))
c = c.astype(np.float64).view(np.complex128)
+ c = c.reshape(n_trans + (M,))
return c
+def type1_problem(dtype, shape, M, n_trans=()):
+ complex_dtype = _complex_dtype(dtype)
+ dim = len(shape)
+
+ k = gen_nu_pts(M, dim=dim).astype(dtype)
+ c = gen_nonuniform_data(M, n_trans=n_trans).astype(complex_dtype)
+
+ return k, c
+
+
+def type2_problem(dtype, shape, M, n_trans=()):
+ complex_dtype = _complex_dtype(dtype)
+ dim = len(shape)
+
+ k = gen_nu_pts(M, dim=dim).astype(dtype)
+ fk = gen_uniform_data(n_trans + shape).astype(complex_dtype)
+
+ return k, fk
+
+
def make_grid(shape):
dim = len(shape)
- shape = (1,) * (3 - dim) + shape
+ shape = shape
grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape]
- x, y, z = np.meshgrid(*grids, indexing='ij')
- return np.stack((x, y, z))
+ grids = np.meshgrid(*grids, indexing='ij')
+ return np.stack(grids)
def direct_type1(c, k, shape, ind):
+ dim = len(shape)
+
grid = make_grid(shape)
- phase = k.T @ grid.reshape((3, -1))[:, ind]
- fk = np.sum(c * np.exp(1j * phase))
+ phase = k.T @ grid.reshape((dim, -1))[:, ind]
+ fk = np.sum(c * np.exp(1j * phase), -1)
return fk
-def direct_type2(fk, k):
- grid = make_grid(fk.shape)
+def direct_type2(fk, k, dim):
+ grid = make_grid(fk.shape[-dim:])
- phase = k @ grid.reshape((3, -1))
- c = np.sum(fk.ravel() * np.exp(-1j * phase))
+ phase = k @ grid.reshape((dim, -1))
+
+ # Ravel the spatial dimensions only.
+ fk = fk.reshape(fk.shape[:-dim] + (np.prod(fk.shape[-dim:]),))
+ c = np.sum(fk * np.exp(-1j * phase), -1)
return c
+
+
+def verify_type1(k, c, fk, tol):
+ dim = fk.ndim - (c.ndim - 1)
+
+ shape = fk.shape[-dim:]
+
+ ind = int(0.1789 * np.prod(shape))
+
+ # Ravel the spatial dimensions only.
+ fk_est = fk.reshape(fk.shape[:-dim] + (np.prod(fk.shape[-dim:]),))[..., ind]
+ fk_target = direct_type1(c, k, shape, ind)
+
+ type1_rel_err = np.linalg.norm(fk_target - fk_est) / np.linalg.norm(fk_target)
+
+ assert type1_rel_err < 25 * tol
+
+
+def verify_type2(k, fk, c, tol):
+ dim = fk.ndim - (c.ndim - 1)
+
+ M = c.shape[-1]
+
+ ind = M // 2
+
+ c_est = c[..., ind]
+ c_target = direct_type2(fk, k[:, ind], dim)
+
+ type2_rel_err = np.linalg.norm(c_target - c_est) / np.linalg.norm(c_target)
+
+ assert type2_rel_err < 25 * tol
+
+
+def transfer_funcs(module_name):
+ if module_name == "pycuda":
+ import pycuda.autoinit # NOQA:401
+ from pycuda.gpuarray import to_gpu
+ def to_cpu(obj):
+ return obj.get()
+ elif module_name == "cupy":
+ import cupy
+ def to_gpu(obj):
+ return cupy.array(obj)
+ def to_cpu(obj):
+ return obj.get()
+ elif module_name == "numba":
+ import numba.cuda
+ to_gpu = numba.cuda.to_device
+ def to_cpu(obj):
+ return obj.copy_to_host()
+ elif module_name == "torch":
+ import torch
+ def to_gpu(obj):
+ return torch.as_tensor(obj, device=torch.device("cuda"))
+ def to_cpu(obj):
+ return obj.cpu().numpy()
+ else:
+ raise TypeError(f"Unsupported framework: {module_name}")
+
+ return to_gpu, to_cpu
diff --git a/python/finufft/finufft/__init__.py b/python/finufft/finufft/__init__.py
index b98791c76..adb129d7e 100644
--- a/python/finufft/finufft/__init__.py
+++ b/python/finufft/finufft/__init__.py
@@ -17,4 +17,4 @@
from finufft._interfaces import nufft2d1,nufft2d2,nufft2d3
from finufft._interfaces import nufft3d1,nufft3d2,nufft3d3
-__version__ = '2.1.0'
+__version__ = '2.2.0.dev0'
diff --git a/python/finufft/finufft/_interfaces.py b/python/finufft/finufft/_interfaces.py
index d4d26dd4b..e7197ecba 100644
--- a/python/finufft/finufft/_interfaces.py
+++ b/python/finufft/finufft/_interfaces.py
@@ -100,26 +100,41 @@ def __init__(self,nufft_type,n_modes_or_dim,n_trans=1,eps=1e-6,isign=None,dtype=
_finufft._default_opts(opts)
setkwopts(opts,**kwargs)
+ dtype = np.dtype(dtype)
+
+ if dtype == np.float64:
+ warnings.warn("Real dtypes are currently deprecated and will be "
+ "removed in version 2.3. Converting to complex128.",
+ DeprecationWarning)
+ dtype = np.complex128
+
+ if dtype == np.float32:
+ warnings.warn("Real dtypes are currently deprecated and will be "
+ "removed in version 2.3. Converting to complex64.",
+ DeprecationWarning)
+ dtype = np.complex64
+
is_single = is_single_dtype(dtype)
# construct plan based on precision type and eps default value
plan = c_void_p(None)
# setting n_modes and dim for makeplan
- n_modes = np.ones([3], dtype=np.int64)
if nufft_type==3:
npdim = np.asarray(n_modes_or_dim, dtype=np.int64)
if npdim.size != 1:
raise RuntimeError('FINUFFT type 3 plan n_modes_or_dim must be one number, the dimension')
dim = int(npdim)
+ n_modes = np.ones([dim], dtype=np.int64)
else:
npmodes = np.asarray(n_modes_or_dim, dtype=np.int64)
if npmodes.size>3 or npmodes.size<1:
raise RuntimeError("FINUFFT n_modes dimension must be 1, 2, or 3")
dim = int(npmodes.size)
+ n_modes = np.ones([dim], dtype=np.int64)
n_modes[0:dim] = npmodes[::-1]
- n_modes = (c_longlong * 3)(*n_modes)
+ n_modes = (c_longlong * dim)(*n_modes)
if is_single:
self._makeplan = _finufft._makeplanf
@@ -204,8 +219,6 @@ def setpts(self,x=None,y=None,z=None,s=None,t=None,u=None):
ier = self._setpts(self.inner_plan, self.nj, self._yj, self._xj, self._zj, self.nk, self._t, self._s, self._u)
elif self.dim == 3:
ier = self._setpts(self.inner_plan, self.nj, self._zj, self._yj, self._xj, self.nk, self._u, self._t, self._s)
- else:
- raise RuntimeError("FINUFFT dimension must be 1, 2, or 3")
if ier != 0:
err_handler(ier)
@@ -242,9 +255,7 @@ def execute(self,data,out=None):
dim = self.dim
if tp==1 or tp==2:
- ms = self.n_modes[0]
- mt = self.n_modes[1]
- mu = self.n_modes[2]
+ ms, mt, mu = [*self.n_modes, *([1]*(3-len(self.n_modes)))]
# input shape and size check
if tp==2:
@@ -264,11 +275,11 @@ def execute(self,data,out=None):
# allocate out if None
if out is None:
if tp==1:
- _out = np.squeeze(np.zeros([n_trans, mu, mt, ms], dtype=self.dtype, order='C'))
+ _out = np.zeros([*data.shape[:-1], *self.n_modes[::-1]], dtype=self.dtype, order='C')
if tp==2:
- _out = np.squeeze(np.zeros([n_trans, nj], dtype=self.dtype, order='C'))
+ _out = np.zeros([*data.shape[:-dim], nj], dtype=self.dtype, order='C')
if tp==3:
- _out = np.squeeze(np.zeros([n_trans, nk], dtype=self.dtype, order='C'))
+ _out = np.zeros([*data.shape[:-1], nk], dtype=self.dtype, order='C')
# call execute based on type and precision type
if tp==1 or tp==3:
@@ -279,19 +290,12 @@ def execute(self,data,out=None):
ier = self._execute(self.inner_plan,
_out.ctypes.data_as(c_void_p),
_data.ctypes.data_as(c_void_p))
- else:
- ier = 10
# check error
if ier != 0:
err_handler(ier)
- # return out
- if out is None:
- return _out
- else:
- _copy(_out,out)
- return out
+ return _out
def __del__(self):
@@ -327,15 +331,6 @@ def _ensure_array_type(x, name, dtype, output=False):
return x
-### David Stein's functions for checking input and output variables
-def _copy(_x, x):
- """
- Copy _x to x, only if the underlying data of _x differs from that of x
- """
- if _x is not x:
- x[:] = _x
-
-
### error handler (keep up to date with FINUFFT/include/defs.h)
def err_handler(ier):
switcher = {
@@ -480,9 +475,9 @@ def valid_fshape(fshape,n_trans,dim,ms,mt,mu,nk,tp):
def is_single_dtype(dtype):
dtype = np.dtype(dtype)
- if dtype == np.dtype('float64') or dtype == np.dtype('complex128'):
+ if dtype == np.dtype('complex128'):
return False
- elif dtype == np.dtype('float32') or dtype == np.dtype('complex64'):
+ elif dtype == np.dtype('complex64'):
return True
else:
raise RuntimeError('FINUFFT dtype(precision type) must be single or double')
@@ -505,22 +500,20 @@ def setkwopts(opt,**kwargs):
### destroy
def destroy(plan):
- if plan is None:
- return
+ if hasattr(plan, "inner_plan"):
+ ier = plan._destroy(plan.inner_plan)
- ier = plan._destroy(plan.inner_plan)
-
- if ier != 0:
- err_handler(ier)
+ if ier != 0:
+ err_handler(ier)
### invoke guru interface, this function is used for simple interfaces
def invoke_guru(dim,tp,x,y,z,c,s,t,u,f,isign,eps,n_modes,**kwargs):
# infer dtype from x
if x.dtype is np.dtype('float64'):
- pdtype = 'double'
+ pdtype = 'complex128'
elif x.dtype is np.dtype('float32'):
- pdtype = 'single'
+ pdtype = 'complex64'
else:
raise RuntimeError('FINUFFT x dtype should be float64 for double precision or float32 for single precision')
# check n_modes type, n_modes must be a tuple or an integer
diff --git a/python/finufft/setup.py b/python/finufft/setup.py
index 9c89b19ed..acd0daae0 100644
--- a/python/finufft/setup.py
+++ b/python/finufft/setup.py
@@ -5,11 +5,12 @@
# attempt ../make.inc reading (failed) and default finufftdir. 2/25/20
# Barnett trying to get sphinx.ext.autodoc to work w/ this, 10/5/20
-__version__ = '2.1.0'
+__version__ = '2.2.0.dev0'
from setuptools import setup, Extension
import os
import platform
+from pathlib import Path
from tempfile import mkstemp
@@ -18,8 +19,7 @@
# Note: This will not work if run through pip install since setup.py is copied
# to a different location.
if finufft_dir == None or finufft_dir == '':
- current_path = os.path.abspath(__file__)
- finufft_dir = os.path.dirname(os.path.dirname(current_path))
+ finufft_dir = Path(__file__).resolve().parents[2]
# Set include and library paths relative to FINUFFT root directory.
inc_dir = os.path.join(finufft_dir, 'include')
diff --git a/python/finufft/test/accuracy_speed_tests.py b/python/finufft/test/accuracy_speed_tests.py
index 2b690d491..78de3d7dd 100644
--- a/python/finufft/test/accuracy_speed_tests.py
+++ b/python/finufft/test/accuracy_speed_tests.py
@@ -21,7 +21,8 @@ def print_report(label,elapsed,Xest,Xtrue,npts):
print(label+':')
print(' Est rel l2 err %.3g' % (compute_error(Xest,Xtrue)))
print(' CPU time (sec) %.3g' % (elapsed))
- print(' tot NU pts/sec %.3g' % (npts/elapsed))
+ if (elapsed>0):
+ print(' tot NU pts/sec %.3g' % (npts/elapsed))
print('')
def accuracy_speed_tests(num_nonuniform_points,num_uniform_points,eps):
diff --git a/python/finufft/test/test_finufft_plan.py b/python/finufft/test/test_finufft_plan.py
new file mode 100644
index 000000000..2bc0d7d64
--- /dev/null
+++ b/python/finufft/test/test_finufft_plan.py
@@ -0,0 +1,245 @@
+import pytest
+
+import numpy as np
+
+from finufft import Plan
+
+import utils
+
+
+# NOTE: Only doing single transformations for now.
+SHAPES = [(7,), (8,), (7, 7), (7, 8), (8, 8), (7, 7, 7), (7, 8, 8), (8, 8, 8)]
+N_PTS = [10, 11]
+DTYPES = [np.complex64, np.complex128]
+OUTPUT_ARGS = [False, True]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("n_pts", N_PTS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft1_plan(dtype, shape, n_pts, output_arg):
+ pts, coefs = utils.type1_problem(dtype, shape, n_pts)
+
+ plan = Plan(1, shape, dtype=dtype)
+
+ plan.setpts(*pts)
+
+ if not output_arg:
+ sig = plan.execute(coefs)
+ else:
+ sig = np.empty(shape, dtype=dtype)
+ plan.execute(coefs, out=sig)
+
+ utils.verify_type1(pts, coefs, shape, sig, 1e-6)
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("n_pts", N_PTS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft2_plan(dtype, shape, n_pts, output_arg):
+ pts, sig = utils.type2_problem(dtype, shape, n_pts)
+
+ plan = Plan(2, shape, dtype=dtype)
+
+ plan.setpts(*pts)
+
+ if not output_arg:
+ coefs = plan.execute(sig)
+ else:
+ coefs = np.empty(n_pts, dtype=dtype)
+ plan.execute(sig, out=coefs)
+
+ utils.verify_type2(pts, sig, coefs, 1e-6)
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("dim", list(set(len(shape) for shape in SHAPES)))
+@pytest.mark.parametrize("n_source_pts", N_PTS)
+@pytest.mark.parametrize("n_target_pts", N_PTS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft3_plan(dtype, dim, n_source_pts, n_target_pts, output_arg):
+ source_pts, source_coefs, target_pts = utils.type3_problem(dtype,
+ dim, n_source_pts, n_target_pts)
+
+ plan = Plan(3, dim, dtype=dtype)
+
+ plan.setpts(*source_pts, *((None,) * (3 - dim)), *target_pts)
+
+ if not output_arg:
+ target_coefs = plan.execute(source_coefs)
+ else:
+ target_coefs = np.empty(n_target_pts, dtype=dtype)
+ plan.execute(source_coefs, out=target_coefs)
+
+ utils.verify_type3(source_pts, source_coefs, target_pts, target_coefs, 1e-6)
+
+
+def test_finufft_plan_modeord():
+ dtype = "complex64"
+ shape = (8, 8)
+ n_pts = 17
+
+ plan = Plan(1, shape, dtype=dtype, modeord=1)
+
+ pts, coefs = utils.type1_problem(dtype, shape, n_pts)
+
+ plan.setpts(*pts)
+
+ sig = plan.execute(coefs)
+
+ sig = np.fft.fftshift(sig)
+
+ utils.verify_type1(pts, coefs, shape, sig, 1e-6)
+
+
+def test_finufft_plan_errors():
+ with pytest.raises(RuntimeError, match="must be single or double"):
+ Plan(1, (8, 8), dtype="uint32")
+
+ with pytest.warns(Warning, match="finufft_opts does not have"):
+ Plan(1, (8, 8), foo="bar")
+
+ with pytest.raises(RuntimeError, match="type 3 plan n_modes_or_dim must be"):
+ Plan(3, (1, 2))
+
+ with pytest.raises(RuntimeError, match="n_modes dimension must be 1, 2"):
+ Plan(2, (1, 2, 3, 4))
+
+ with pytest.warns(Warning, match="eps tolerance too small"):
+ Plan(1, (8, 8), eps=1e-30)
+
+ with pytest.raises(TypeError, match="does not have the correct dtype"):
+ Plan(1, (8, 8), dtype="complex64").setpts(np.ones(1, dtype="complex128"))
+
+ with pytest.raises(TypeError, match="the following requirement: C"):
+ plan = Plan(1, (8, 8), dtype="complex64")
+ plan.setpts(*np.ones((2, 1), dtype="float32"))
+ plan.execute(np.ones(1, dtype="complex64"), out=np.ones((8, 8),
+ dtype="complex64", order="F"))
+
+ with pytest.raises(TypeError, match="the following requirement: W"):
+ plan = Plan(1, (8, 8), dtype="complex64")
+ plan.setpts(*np.ones((2, 1), dtype="float32"))
+ out = np.ones((8, 8), dtype="complex64")
+ out.setflags(write=False)
+ plan.execute(np.ones(1, dtype="complex64"), out=out)
+
+ with pytest.warns(Warning, match="the following requirement: C. Copying"):
+ plan = Plan(2, (8, 8), dtype="complex64")
+ plan.setpts(*np.ones((2, 1), dtype="float32"))
+ plan.execute(np.ones((8, 8), dtype="complex64", order="F"))
+
+ vec = np.ones(1, dtype="float32")
+ not_vec = np.ones((2, 1), dtype="float32")
+
+ with pytest.raises(RuntimeError, match="x must be a vector"):
+ Plan(1, (8, 8, 8), dtype="complex64").setpts(not_vec, vec, vec)
+
+ with pytest.raises(RuntimeError, match="y must be a vector"):
+ Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, not_vec, vec)
+
+ with pytest.raises(RuntimeError, match="z must be a vector"):
+ Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, vec, not_vec)
+
+ with pytest.raises(RuntimeError, match="s must be a vector"):
+ Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, not_vec, vec, vec)
+
+ with pytest.raises(RuntimeError, match="t must be a vector"):
+ Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, not_vec, vec)
+
+ with pytest.raises(RuntimeError, match="u must be a vector"):
+ Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, vec, not_vec)
+
+ vec = np.ones(3, dtype="float32")
+ long_vec = np.ones(4, dtype="float32")
+
+ with pytest.raises(RuntimeError, match="y must have same length as x"):
+ Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, long_vec, vec)
+
+ with pytest.raises(RuntimeError, match="z must have same length as x"):
+ Plan(1, (8, 8, 8), dtype="complex64").setpts(vec, vec, long_vec)
+
+ with pytest.raises(RuntimeError, match="t must have same length as s"):
+ Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, long_vec, vec)
+
+ with pytest.raises(RuntimeError, match="u must have same length as s"):
+ Plan(3, 3, dtype="complex64").setpts(vec, vec, vec, vec, vec, long_vec)
+
+ with pytest.raises(RuntimeError, match="c.ndim must be 1 if n_trans = 1"):
+ plan = Plan(1, (8,), dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones((2, 1), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match="c.size must be same as x.size"):
+ plan = Plan(1, (8,), dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones(4, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match="c.ndim must be 2 if n_trans > 1"):
+ plan = Plan(1, (8,), n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones(2, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"c.shape must be \(n_trans, x.size\)"):
+ plan = Plan(1, (8,), n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones((2, 4), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match="same as the problem dimension for"):
+ plan = Plan(2, (8,), dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones((2, 8), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"same as the problem dimension \+ 1 for"):
+ plan = Plan(2, (8,), n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones(8, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.shape\[0\] must be n_trans"):
+ plan = Plan(2, (8,), n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones((3, 8), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"):
+ plan = Plan(2, (8,), dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"))
+ plan.execute(np.ones(2, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"):
+ plan = Plan(2, (8, 9), dtype="complex64")
+ plan.setpts(*np.ones((2, 3), dtype="float32"))
+ plan.execute(np.ones((2, 9), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.shape is not consistent"):
+ plan = Plan(2, (8, 9, 10), dtype="complex64")
+ plan.setpts(*np.ones((3, 3), dtype="float32"))
+ plan.execute(np.ones((2, 9, 10), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.ndim must be 1"):
+ plan = Plan(3, 1, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32"))
+ plan.execute(np.ones(3, dtype="complex64"), out=np.ones((2, 3), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.size of must be nk"):
+ plan = Plan(3, 1, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32"))
+ plan.execute(np.ones(3, dtype="complex64"), out=np.ones(4, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.ndim must be 2"):
+ plan = Plan(3, 1, n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32"))
+ plan.execute(np.ones((2, 3), dtype="complex64"), out=np.ones(3, dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"f\.shape must be \(n_trans, nk\)"):
+ plan = Plan(3, 1, n_trans=2, dtype="complex64")
+ plan.setpts(np.ones(3, dtype="float32"), s=np.ones(3, dtype="float32"))
+ plan.execute(np.ones((2, 3), dtype="complex64"), out=np.ones((2, 4), dtype="complex64"))
+
+ with pytest.raises(RuntimeError, match=r"point out of range \[-3pi,3pi\]"):
+ plan = Plan(1, (8,), dtype="complex64")
+ plan.setpts(4 * np.pi * np.ones(1, dtype="float32"))
+
+ with pytest.raises(RuntimeError, match="transform type invalid"):
+ plan = Plan(4, (8,))
diff --git a/python/finufft/test/test_finufft_simple.py b/python/finufft/test/test_finufft_simple.py
new file mode 100644
index 000000000..29008b8bb
--- /dev/null
+++ b/python/finufft/test/test_finufft_simple.py
@@ -0,0 +1,123 @@
+import pytest
+
+import numpy as np
+import finufft
+
+import utils
+
+
+SHAPES = [(7,), (8,), (7, 7), (7, 8), (8, 8), (7, 7, 7), (7, 8, 8), (8, 8, 8)]
+N_PTS = [10, 11]
+N_TRANS = [(), (2,)]
+DTYPES = [np.complex64, np.complex128]
+OUTPUT_ARGS = [False, True]
+
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("n_pts", N_PTS)
+@pytest.mark.parametrize("n_trans", N_TRANS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft1_simple(dtype, shape, n_pts, n_trans, output_arg):
+ dim = len(shape)
+
+ funs = {1: finufft.nufft1d1,
+ 2: finufft.nufft2d1,
+ 3: finufft.nufft3d1}
+
+ fun = funs[dim]
+
+ pts, coefs = utils.type1_problem(dtype, shape, n_pts, n_trans)
+
+ # See if it can handle square sizes from ints
+ if all(n == shape[0] for n in shape):
+ _shape = shape[0]
+ else:
+ _shape = shape
+
+ if not output_arg:
+ sig = fun(*pts, coefs, _shape)
+ else:
+ sig = np.empty(n_trans + shape, dtype=dtype)
+ fun(*pts, coefs, out=sig)
+
+ utils.verify_type1(pts, coefs, shape, sig, 1e-6)
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("shape", SHAPES)
+@pytest.mark.parametrize("n_pts", N_PTS)
+@pytest.mark.parametrize("n_trans", N_TRANS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft2_simple(dtype, shape, n_pts, n_trans, output_arg):
+ dim = len(shape)
+
+ funs = {1: finufft.nufft1d2,
+ 2: finufft.nufft2d2,
+ 3: finufft.nufft3d2}
+
+ fun = funs[dim]
+
+ pts, sig = utils.type2_problem(dtype, shape, n_pts, n_trans)
+
+ if not output_arg:
+ coefs = fun(*pts, sig)
+ else:
+ coefs = np.empty(n_trans + (n_pts,), dtype=dtype)
+ fun(*pts, sig, out=coefs)
+
+ utils.verify_type2(pts, sig, coefs, 1e-6)
+
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("dim", list(set(len(shape) for shape in SHAPES)))
+@pytest.mark.parametrize("n_source_pts", N_PTS)
+@pytest.mark.parametrize("n_target_pts", N_PTS)
+@pytest.mark.parametrize("n_trans", N_TRANS)
+@pytest.mark.parametrize("output_arg", OUTPUT_ARGS)
+def test_finufft3_simple(dtype, dim, n_source_pts, n_target_pts, n_trans, output_arg):
+ funs = {1: finufft.nufft1d3,
+ 2: finufft.nufft2d3,
+ 3: finufft.nufft3d3}
+
+ fun = funs[dim]
+
+ source_pts, source_coefs, target_pts = utils.type3_problem(dtype,
+ dim, n_source_pts, n_target_pts, n_trans)
+
+ if not output_arg:
+ target_coefs = fun(*source_pts, source_coefs, *target_pts)
+ else:
+ target_coefs = np.empty(n_trans + (n_target_pts,), dtype=dtype)
+ fun(*source_pts, source_coefs, *target_pts, out=target_coefs)
+
+ utils.verify_type3(source_pts, source_coefs, target_pts, target_coefs, 1e-6)
+
+def test_finufft_simple_errors():
+ with pytest.raises(RuntimeError, match="x dtype should be"):
+ finufft.nufft1d1(np.zeros(1, "int64"), np.zeros(1), (4,))
+
+ with pytest.raises(RuntimeError, match="n_modes must be"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1), "")
+
+ with pytest.raises(RuntimeError, match="n_modes dimension does not match"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1), (2, 2))
+
+ with pytest.raises(RuntimeError, match="elements of input n_modes must"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1), (2.5,))
+
+ with pytest.raises(RuntimeError, match="type 1 input must supply n_modes"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1))
+
+ with pytest.raises(RuntimeError, match="c.size must be divisible by x.size"):
+ finufft.nufft1d1(np.zeros(2), np.zeros(3, np.complex128), 4)
+
+ with pytest.raises(RuntimeError, match="type 2 input dimension must be either dim"):
+ finufft.nufft1d2(np.zeros(1), np.zeros((2, 2, 2), np.complex128))
+
+ with pytest.raises(RuntimeError, match="type 2 input dimension must be either dim"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1, np.complex128), 4, out=np.zeros((2, 2, 4), np.complex128))
+
+ with pytest.raises(RuntimeError, match="input n_trans and output n_trans do not match"):
+ finufft.nufft1d1(np.zeros(1), np.zeros((3, 1), np.complex128), 4, out=np.zeros((2, 4), np.complex128))
+
+ with pytest.raises(RuntimeError, match="input n_modes and output n_modes do not match"):
+ finufft.nufft1d1(np.zeros(1), np.zeros(1, np.complex128), 4, out=np.zeros((3), np.complex128))
diff --git a/python/finufft/test/utils.py b/python/finufft/test/utils.py
new file mode 100644
index 000000000..5d372f7f0
--- /dev/null
+++ b/python/finufft/test/utils.py
@@ -0,0 +1,194 @@
+import numpy as np
+
+
+def _complex_dtype(dtype):
+ dtype = np.dtype(dtype)
+
+ if dtype == np.float32:
+ complex_dtype = np.complex64
+ elif dtype == np.float64:
+ complex_dtype = np.complex128
+ else:
+ raise TypeError("dtype should be np.float32 or np.float64.")
+
+ return complex_dtype
+
+
+def _real_dtype(complex_dtype):
+ complex_dtype = np.dtype(complex_dtype)
+
+ if complex_dtype == np.complex64:
+ real_dtype = np.float32
+ elif complex_dtype == np.complex128:
+ real_dtype = np.float64
+ else:
+ raise TypeError("dtype should be np.complex64 or np.complex128.")
+
+ return real_dtype
+
+
+def gen_nu_pts(M, dim=3, seed=0):
+ np.random.seed(seed)
+ k = np.random.uniform(-np.pi, np.pi, (dim, M))
+ k = k.astype(np.float64)
+ return k
+
+
+def gen_uniform_data(shape, seed=0):
+ np.random.seed(seed)
+ fk = np.random.standard_normal(shape + (2,))
+ fk = fk.astype(np.float64).view(np.complex128)[..., 0]
+ return fk
+
+
+def gen_nonuniform_data(M, seed=0, n_trans=()):
+ np.random.seed(seed)
+ c = np.random.standard_normal(2 * M * int(np.prod(n_trans)))
+ c = c.astype(np.float64).view(np.complex128)
+ c = c.reshape(n_trans + (M,))
+ return c
+
+
+def gen_sig_idx(n_modes, n_tr):
+ idx = tuple(np.random.randint(0, n) for n in n_tr + n_modes)
+ return idx
+
+
+def gen_coef_ind(n_pts, n_tr):
+ ind = tuple(np.random.randint(0, n) for n in n_tr + (n_pts,))
+ return ind
+
+
+def type1_problem(dtype, shape, M, n_trans=()):
+ real_dtype = _real_dtype(dtype)
+ dim = len(shape)
+
+ k = gen_nu_pts(M, dim=dim).astype(real_dtype)
+ c = gen_nonuniform_data(M, n_trans=n_trans).astype(dtype)
+
+ return k, c
+
+
+def type2_problem(dtype, shape, M, n_trans=()):
+ real_dtype = _real_dtype(dtype)
+ dim = len(shape)
+
+ k = gen_nu_pts(M, dim=dim).astype(real_dtype)
+ fk = gen_uniform_data(n_trans + shape).astype(dtype)
+
+ return k, fk
+
+def type3_problem(dtype, dim, n_source_pts, n_target_pts, n_trans=()):
+ real_dtype = _real_dtype(dtype)
+
+ source_pts = gen_nu_pts(n_source_pts, dim=dim).astype(real_dtype)
+ source_coefs = gen_nonuniform_data(n_source_pts, n_trans=n_trans).astype(dtype)
+ target_pts = gen_nu_pts(n_target_pts, dim=dim).astype(real_dtype)
+
+ return source_pts, source_coefs, target_pts
+
+
+def make_grid(shape):
+ dim = len(shape)
+ shape = shape
+
+ grids = [np.arange(-(N // 2), (N + 1) // 2) for N in shape]
+ grids = np.meshgrid(*grids, indexing='ij')
+ return np.stack(grids)
+
+
+def direct_type1(pts, coefs, n_modes, idx):
+ dtype = coefs.dtype
+ dim = len(n_modes)
+
+ _idx = (np.array(idx[-dim:]) - np.floor(np.array(n_modes) / 2)).astype(dtype)
+
+ _coefs = coefs[idx[:-dim]]
+
+ sig = np.sum(np.exp(1j * np.sum(_idx[:, np.newaxis] * pts, axis=0)) * _coefs)
+
+ return sig
+
+
+def direct_type2(pts, sig, ind):
+ dtype = sig.dtype
+ dim = pts.shape[0]
+ n_modes = sig.shape[-dim:]
+
+ grids = [slice(-np.floor(n / 2), np.ceil(n / 2)) for n in n_modes]
+ grid = np.mgrid[grids].astype(dtype)
+
+ pt = pts[:, ind[-1]]
+ pt = pt.reshape((dim,) + dim * (1,))
+
+ _sig = sig[ind[:-1]]
+
+ coef = np.sum(np.exp(-1j * np.sum(pt * grid, axis=0)) * _sig)
+
+ return coef
+
+
+def direct_type3(source_pts, source_coefs, target_pts, ind):
+ target_pt = target_pts[:, ind[-1]]
+ target_pt = target_pt[:, np.newaxis]
+
+ _source_coef = source_coefs[ind[:-1]]
+
+ target_coef = np.sum(np.exp(1j * np.sum(target_pt * source_pts, axis=0)) * _source_coef)
+
+ return target_coef
+
+
+def verify_type1(pts, coefs, shape, sig_est, tol):
+ dim = pts.shape[0]
+
+ n_trans = coefs.shape[:-1]
+
+ assert sig_est.shape[:-dim] == n_trans
+ assert sig_est.shape[-dim:] == shape
+
+ idx = gen_sig_idx(shape, n_trans)
+
+ fk_est = sig_est[idx]
+ fk_target = direct_type1(pts, coefs, shape, idx)
+
+ type1_rel_err = np.linalg.norm(fk_target - fk_est) / np.linalg.norm(fk_target)
+
+ assert type1_rel_err < 25 * tol
+
+
+def verify_type2(pts, sig, coefs_est, tol):
+ dim = pts.shape[0]
+
+ n_trans = sig.shape[:-dim]
+ n_pts = pts.shape[-1]
+
+ assert coefs_est.shape == n_trans + (n_pts,)
+
+ ind = gen_coef_ind(n_pts, n_trans)
+
+ c_est = coefs_est[ind]
+ c_target = direct_type2(pts, sig, ind)
+
+ type2_rel_err = np.linalg.norm(c_target - c_est) / np.linalg.norm(c_target)
+
+ assert type2_rel_err < 25 * tol
+
+
+def verify_type3(source_pts, source_coef, target_pts, target_coef, tol):
+ dim = source_pts.shape[0]
+
+ n_source_pts = source_pts.shape[-1]
+ n_target_pts = target_pts.shape[-1]
+ n_tr = source_coef.shape[:-1]
+
+ assert target_coef.shape == n_tr + (n_target_pts,)
+
+ ind = gen_coef_ind(n_source_pts, n_tr)
+
+ target_est = target_coef[ind]
+ target_true = direct_type3(source_pts, source_coef, target_pts, ind)
+
+ type3_rel_err = np.linalg.norm(target_est - target_true) / np.linalg.norm(target_true)
+
+ assert type3_rel_err < 100 * tol
diff --git a/src/cuda/1d/cufinufft1d.cu b/src/cuda/1d/cufinufft1d.cu
index 841726376..dce4cf67c 100644
--- a/src/cuda/1d/cufinufft1d.cu
+++ b/src/cuda/1d/cufinufft1d.cu
@@ -1,6 +1,6 @@
#include
#include
-#include
+#include
#include
#include
#include
@@ -33,36 +33,36 @@ int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla
{
assert(d_plan->spopts.spread_direction == 1);
- int blksize;
int ier;
cuda_complex *d_fkstart;
cuda_complex *d_cstart;
for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
- blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+ int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
d_plan->c = d_cstart;
d_plan->fk = d_fkstart;
- checkCudaErrors(
- cudaMemset(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex))); // this is needed
+ // this is needed
+ if ((ier = checkCudaErrors(
+ cudaMemset(d_plan->fw, 0, d_plan->maxbatchsize * d_plan->nf1 * sizeof(cuda_complex)))))
+ return ier;
// Step 1: Spread
- ier = cuspread1d(d_plan, blksize);
-
- if (ier != 0) {
- printf("error: cuspread1d, method(%d)\n", d_plan->opts.gpu_method);
+ if ((ier = cuspread1d(d_plan, blksize)))
return ier;
- }
// Step 2: FFT
- cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+ cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+ if (cufft_status != CUFFT_SUCCESS)
+ return FINUFFT_ERR_CUDA_FAILURE;
// Step 3: deconvolve and shuffle
- cudeconvolve1d(d_plan, blksize);
+ if ((ier = cudeconvolve1d(d_plan, blksize)))
+ return ier;
}
- return ier;
+ return 0;
}
template
@@ -82,12 +82,11 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla
{
assert(d_plan->spopts.spread_direction == 2);
- int blksize;
int ier;
cuda_complex *d_fkstart;
cuda_complex *d_cstart;
for (int i = 0; i * d_plan->maxbatchsize < d_plan->ntransf; i++) {
- blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
+ int blksize = std::min(d_plan->ntransf - i * d_plan->maxbatchsize, d_plan->maxbatchsize);
d_cstart = d_c + i * d_plan->maxbatchsize * d_plan->M;
d_fkstart = d_fk + i * d_plan->maxbatchsize * d_plan->ms;
@@ -95,21 +94,23 @@ int cufinufft1d2_exec(cuda_complex *d_c, cuda_complex *d_fk, cufinufft_pla
d_plan->fk = d_fkstart;
// Step 1: amplify Fourier coeffs fk and copy into upsampled array fw
- cudeconvolve1d(d_plan, blksize);
+ if ((ier = cudeconvolve1d(d_plan, blksize)))
+ return ier;
// Step 2: FFT
cudaDeviceSynchronize();
- cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+ RETURN_IF_CUDA_ERROR
- // Step 3: deconvolve and shuffle
- ier = cuinterp1d(d_plan, blksize);
+ cufftResult cufft_status = cufft_ex(d_plan->fftplan, d_plan->fw, d_plan->fw, d_plan->iflag);
+ if (cufft_status != CUFFT_SUCCESS)
+ return FINUFFT_ERR_CUDA_FAILURE;
- if (ier != 0) {
- printf("error: cuinterp1d, method(%d)\n", d_plan->opts.gpu_method);
+ // Step 3: deconvolve and shuffle
+ if ((ier = cuinterp1d(d_plan, blksize)))
return ier;
- }
}
- return ier;
+
+ return 0;
}
template int cufinufft1d1_exec(cuda_complex *d_c, cuda_complex *d_fk,
diff --git a/src/cuda/1d/interp1d_wrapper.cu b/src/cuda/1d/interp1d_wrapper.cu
index 29037290e..36fb6a0f0 100644
--- a/src/cuda/1d/interp1d_wrapper.cu
+++ b/src/cuda/1d/interp1d_wrapper.cu
@@ -1,5 +1,5 @@
#include
-#include
+#include
#include
#include
@@ -54,15 +54,11 @@ inline int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, T *d_kx, cu
}
ier = cuinterp1d(d_plan, 1);
- freegpumemory1d(d_plan);
+ freegpumemory(d_plan);
return ier;
}
-template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, float *d_kx, cuda_complex *d_c,
- cufinufft_plan_t *d_plan);
-template int cufinufft_interp1d(int nf1, cuda_complex *d_fw, int M, double *d_kx, cuda_complex *d_c,
- cufinufft_plan_t *d_plan);
template
int cuinterp1d(cufinufft_plan_t *d_plan, int blksize)
@@ -83,14 +79,10 @@ int cuinterp1d(cufinufft_plan_t *d_plan, int blksize)
switch (d_plan->opts.gpu_method) {
case 1: {
ier = cuinterp1d_nuptsdriven(nf1, M, d_plan, blksize);
- if (ier != 0) {
- std::cout << "error: cnufftspread1d_gpu_nuptsdriven" << std::endl;
- return 1;
- }
} break;
default:
- std::cout << "error: incorrect method, should be 1" << std::endl;
- return 2;
+ std::cerr << "[cuinterp1d] error: incorrect method, should be 1" << std::endl;
+ ier = FINUFFT_ERR_METHOD_NOTVALID;
}
return ier;
@@ -119,20 +111,27 @@ int cuinterp1d_nuptsdriven(int nf1, int M, cufinufft_plan_t