From 2726bc383a12d14411b8c5e1f63d97c736a894c8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 22 Oct 2024 19:20:51 +0200 Subject: [PATCH 01/11] fix PR --- CMakeLists.txt | 10 +- fortran/finufftfort.cpp | 221 +++++------ include/finufft.h | 3 +- include/finufft/defs.h | 137 ------- include/finufft/dirft.h | 42 ++- include/finufft/fft.h | 9 + include/finufft/finufft_core.h | 22 +- include/finufft/spreadinterp.h | 3 +- include/finufft/test_defs.h | 113 +++++- include/finufft/utils.h | 35 +- include/finufft_eitherprec.h | 7 +- makefile | 8 +- perftest/manysmallprobs.cpp | 2 +- perftest/spreadtestnd.cpp | 2 +- perftest/spreadtestndall.cpp | 2 +- src/{simpleinterfaces.cpp => c_interface.cpp} | 248 ++++-------- src/{finufft_core.cpp => finufft.cpp} | 357 ++++++++---------- src/spreadinterp.cpp | 125 +++--- src/utils.cpp | 2 +- test/directft/dirft1d.cpp | 38 +- test/directft/dirft2d.cpp | 66 ++-- test/directft/dirft3d.cpp | 79 ++-- test/dumbinputs.cpp | 2 +- 23 files changed, 696 insertions(+), 837 deletions(-) delete mode 100644 include/finufft/defs.h rename src/{simpleinterfaces.cpp => c_interface.cpp} (55%) rename src/{finufft_core.cpp => finufft.cpp} (82%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e5e2cf5d..3ea8b76c7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -169,7 +169,7 @@ if(FINUFFT_USE_CPU) set(XTL_VERSION 0.7.7) set(XSIMD_VERSION 13.0.0) # using latest ducc0 version for now as it fixes MacOS GCC build - set(DUCC0_VERSION b0beb85e03982344a31ebb119758d7aa3ef5d362) + set(DUCC0_VERSION ducc0_0_35_0) set(FINUFFT_FFTW_LIBRARIES) include(cmake/setupXSIMD.cmake) if(FINUFFT_USE_DUCC0) @@ -257,8 +257,8 @@ if(FINUFFT_USE_CPU) src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp - src/finufft_core.cpp - src/simpleinterfaces.cpp + src/finufft.cpp + src/c_interface.cpp fortran/finufftfort.cpp) else() add_library( @@ -267,8 +267,8 @@ if(FINUFFT_USE_CPU) src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp - src/finufft_core.cpp - src/simpleinterfaces.cpp + src/finufft.cpp + src/c_interface.cpp fortran/finufftfort.cpp) endif() set_finufft_options(finufft) diff --git a/fortran/finufftfort.cpp b/fortran/finufftfort.cpp index 400ff0985..059b44c4c 100644 --- a/fortran/finufftfort.cpp +++ b/fortran/finufftfort.cpp @@ -21,13 +21,19 @@ #include #include +using f32 = float; +using f64 = double; +using c64 = std::complex; +using c128 = std::complex; +using i64 = BIGINT; + #ifdef __cplusplus extern "C" { #endif // --------------------- guru interface from fortran ------------------------ -void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int *n_transf, - double *tol, finufft_plan *plan, finufft_opts *o, int *ier) { +void finufft_makeplan_(int *type, int *n_dims, i64 *n_modes, int *iflag, int *n_transf, + f64 *tol, finufft_plan *plan, finufft_opts *o, int *ier) { if (!plan) fprintf(stderr, "%s fortran: plan must be allocated as at least the size of a C pointer " @@ -39,8 +45,8 @@ void finufft_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, int } } -void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, double *zj, - BIGINT *nk, double *s, double *t, double *u, int *ier) { +void finufft_setpts_(finufft_plan *plan, i64 *M, f64 *xj, f64 *yj, f64 *zj, i64 *nk, + f64 *s, f64 *t, f64 *u, int *ier) { if (!*plan) { fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; @@ -50,8 +56,7 @@ void finufft_setpts_(finufft_plan *plan, BIGINT *M, double *xj, double *yj, doub *ier = finufft_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void finufft_execute_(finufft_plan *plan, std::complex *weights, - std::complex *result, int *ier) { +void finufft_execute_(finufft_plan *plan, c128 *weights, c128 *result, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else @@ -77,124 +82,104 @@ void finufft_default_opts_(finufft_opts *o) { // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void finufft1d1_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, - double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, - int *ier) { +void finufft1d1_(i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, i64 *ms, c128 *fk, + finufft_opts *o, int *ier) { *ier = finufft1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufft1d1many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, - int *iflag, double *eps, BIGINT *ms, std::complex *fk, - finufft_opts *o, int *ier) { +void finufft1d1many_(int *ntransf, i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, + i64 *ms, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufft1d2_(BIGINT *nj, double *xj, std::complex *cj, int *iflag, - double *eps, BIGINT *ms, std::complex *fk, finufft_opts *o, - int *ier) { +void finufft1d2_(i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, i64 *ms, c128 *fk, + finufft_opts *o, int *ier) { *ier = finufft1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufft1d2many_(int *ntransf, BIGINT *nj, double *xj, std::complex *cj, - int *iflag, double *eps, BIGINT *ms, std::complex *fk, - finufft_opts *o, int *ier) { +void finufft1d2many_(int *ntransf, i64 *nj, f64 *xj, c128 *cj, int *iflag, f64 *eps, + i64 *ms, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufft1d3_(BIGINT *nj, double *x, std::complex *c, int *iflag, double *eps, - BIGINT *nk, double *s, std::complex *f, finufft_opts *o, - int *ier) { +void finufft1d3_(i64 *nj, f64 *x, c128 *c, int *iflag, f64 *eps, i64 *nk, f64 *s, c128 *f, + finufft_opts *o, int *ier) { *ier = finufft1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void finufft1d3many_(int *ntransf, BIGINT *nj, double *x, std::complex *c, - int *iflag, double *eps, BIGINT *nk, double *s, - std::complex *f, finufft_opts *o, int *ier) { +void finufft1d3many_(int *ntransf, i64 *nj, f64 *x, c128 *c, int *iflag, f64 *eps, + i64 *nk, f64 *s, c128 *f, finufft_opts *o, int *ier) { *ier = finufft1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void finufft2d1_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, - double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, - finufft_opts *o, int *ier) { +void finufft2d1_(i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, f64 *eps, i64 *ms, + i64 *mt, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufft2d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, - std::complex *cj, int *iflag, double *eps, BIGINT *ms, - BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { +void finufft2d1many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, + f64 *eps, i64 *ms, i64 *mt, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufft2d2_(BIGINT *nj, double *xj, double *yj, std::complex *cj, int *iflag, - double *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, - finufft_opts *o, int *ier) { +void finufft2d2_(i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, f64 *eps, i64 *ms, + i64 *mt, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufft2d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, - std::complex *cj, int *iflag, double *eps, BIGINT *ms, - BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { +void finufft2d2many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, c128 *cj, int *iflag, + f64 *eps, i64 *ms, i64 *mt, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufft2d3_(BIGINT *nj, double *x, double *y, std::complex *c, int *iflag, - double *eps, BIGINT *nk, double *s, double *t, std::complex *f, - finufft_opts *o, int *ier) { +void finufft2d3_(i64 *nj, f64 *x, f64 *y, c128 *c, int *iflag, f64 *eps, i64 *nk, f64 *s, + f64 *t, c128 *f, finufft_opts *o, int *ier) { *ier = finufft2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void finufft2d3many_(int *ntransf, BIGINT *nj, double *x, double *y, - std::complex *c, int *iflag, double *eps, BIGINT *nk, - double *s, double *t, std::complex *f, finufft_opts *o, - int *ier) { +void finufft2d3many_(int *ntransf, i64 *nj, f64 *x, f64 *y, c128 *c, int *iflag, f64 *eps, + i64 *nk, f64 *s, f64 *t, c128 *f, finufft_opts *o, int *ier) { *ier = finufft2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } // --- 3D --- -void finufft3d1_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, - int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, - std::complex *fk, finufft_opts *o, int *ier) { +void finufft3d1_(i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int *iflag, f64 *eps, + i64 *ms, i64 *mt, i64 *mu, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufft3d1many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, - std::complex *cj, int *iflag, double *eps, BIGINT *ms, - BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, - int *ier) { +void finufft3d1many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, + int *iflag, f64 *eps, i64 *ms, i64 *mt, i64 *mu, c128 *fk, + finufft_opts *o, int *ier) { *ier = finufft3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufft3d2_(BIGINT *nj, double *xj, double *yj, double *zj, std::complex *cj, - int *iflag, double *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, - std::complex *fk, finufft_opts *o, int *ier) { +void finufft3d2_(i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int *iflag, f64 *eps, + i64 *ms, i64 *mt, i64 *mu, c128 *fk, finufft_opts *o, int *ier) { *ier = finufft3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufft3d2many_(int *ntransf, BIGINT *nj, double *xj, double *yj, double *zj, - std::complex *cj, int *iflag, double *eps, BIGINT *ms, - BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, - int *ier) { +void finufft3d2many_(int *ntransf, i64 *nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, + int *iflag, f64 *eps, i64 *ms, i64 *mt, i64 *mu, c128 *fk, + finufft_opts *o, int *ier) { *ier = finufft3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufft3d3_(BIGINT *nj, double *x, double *y, double *z, std::complex *c, - int *iflag, double *eps, BIGINT *nk, double *s, double *t, double *u, - std::complex *f, finufft_opts *o, int *ier) { +void finufft3d3_(i64 *nj, f64 *x, f64 *y, f64 *z, c128 *c, int *iflag, f64 *eps, i64 *nk, + f64 *s, f64 *t, f64 *u, c128 *f, finufft_opts *o, int *ier) { *ier = finufft3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void finufft3d3many_(int *ntransf, BIGINT *nj, double *x, double *y, double *z, - std::complex *c, int *iflag, double *eps, BIGINT *nk, - double *s, double *t, double *u, std::complex *f, - finufft_opts *o, int *ier) { +void finufft3d3many_(int *ntransf, i64 *nj, f64 *x, f64 *y, f64 *z, c128 *c, int *iflag, + f64 *eps, i64 *nk, f64 *s, f64 *t, f64 *u, c128 *f, finufft_opts *o, + int *ier) { *ier = finufft3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } // --------------------- guru interface from fortran ------------------------ -void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, - int *n_transf, float *tol, finufftf_plan *plan, finufft_opts *o, - int *ier) { +void finufftf_makeplan_(int *type, int *n_dims, i64 *n_modes, int *iflag, int *n_transf, + f32 *tol, finufftf_plan *plan, finufft_opts *o, int *ier) { if (!plan) fprintf(stderr, "%s fortran: plan must be allocated as at least the size of a C pointer " @@ -206,8 +191,8 @@ void finufftf_makeplan_(int *type, int *n_dims, BIGINT *n_modes, int *iflag, } } -void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, float *zj, - BIGINT *nk, float *s, float *t, float *u, int *ier) { +void finufftf_setpts_(finufftf_plan *plan, i64 *M, f32 *xj, f32 *yj, f32 *zj, i64 *nk, + f32 *s, f32 *t, f32 *u, int *ier) { if (!*plan) { fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); return; @@ -217,8 +202,7 @@ void finufftf_setpts_(finufftf_plan *plan, BIGINT *M, float *xj, float *yj, floa *ier = finufftf_setpts(*plan, *M, xj, yj, zj, nk_safe, s, t, u); } -void finufftf_execute_(finufftf_plan *plan, std::complex *weights, - std::complex *result, int *ier) { +void finufftf_execute_(finufftf_plan *plan, c64 *weights, c64 *result, int *ier) { if (!plan) fprintf(stderr, "%s fortran: finufft_plan unallocated!", __func__); else @@ -244,115 +228,98 @@ void finufftf_default_opts_(finufft_opts *o) { // -------------- simple and many-vector interfaces -------------------- // --- 1D --- -void finufftf1d1_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, - BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { +void finufftf1d1_(i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, i64 *ms, c64 *fk, + finufft_opts *o, int *ier) { *ier = finufftf1d1(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufftf1d1many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, - int *iflag, float *eps, BIGINT *ms, std::complex *fk, - finufft_opts *o, int *ier) { +void finufftf1d1many_(int *ntransf, i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, + i64 *ms, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf1d1many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufftf1d2_(BIGINT *nj, float *xj, std::complex *cj, int *iflag, float *eps, - BIGINT *ms, std::complex *fk, finufft_opts *o, int *ier) { +void finufftf1d2_(i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, i64 *ms, c64 *fk, + finufft_opts *o, int *ier) { *ier = finufftf1d2(*nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufftf1d2many_(int *ntransf, BIGINT *nj, float *xj, std::complex *cj, - int *iflag, float *eps, BIGINT *ms, std::complex *fk, - finufft_opts *o, int *ier) { +void finufftf1d2many_(int *ntransf, i64 *nj, f32 *xj, c64 *cj, int *iflag, f32 *eps, + i64 *ms, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf1d2many(*ntransf, *nj, xj, cj, *iflag, *eps, *ms, fk, o); } -void finufftf1d3_(BIGINT *nj, float *x, std::complex *c, int *iflag, float *eps, - BIGINT *nk, float *s, std::complex *f, finufft_opts *o, - int *ier) { +void finufftf1d3_(i64 *nj, f32 *x, c64 *c, int *iflag, f32 *eps, i64 *nk, f32 *s, c64 *f, + finufft_opts *o, int *ier) { *ier = finufftf1d3(*nj, x, c, *iflag, *eps, *nk, s, f, o); } -void finufftf1d3many_(int *ntransf, BIGINT *nj, float *x, std::complex *c, - int *iflag, float *eps, BIGINT *nk, float *s, - std::complex *f, finufft_opts *o, int *ier) { +void finufftf1d3many_(int *ntransf, i64 *nj, f32 *x, c64 *c, int *iflag, f32 *eps, + i64 *nk, f32 *s, c64 *f, finufft_opts *o, int *ier) { *ier = finufftf1d3many(*ntransf, *nj, x, c, *iflag, *eps, *nk, s, f, o); } // --- 2D --- -void finufftf2d1_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, - float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, - finufft_opts *o, int *ier) { +void finufftf2d1_(i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, f32 *eps, i64 *ms, + i64 *mt, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf2d1(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufftf2d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, - std::complex *cj, int *iflag, float *eps, BIGINT *ms, - BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { +void finufftf2d1many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, + f32 *eps, i64 *ms, i64 *mt, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf2d1many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufftf2d2_(BIGINT *nj, float *xj, float *yj, std::complex *cj, int *iflag, - float *eps, BIGINT *ms, BIGINT *mt, std::complex *fk, - finufft_opts *o, int *ier) { +void finufftf2d2_(i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, f32 *eps, i64 *ms, + i64 *mt, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf2d2(*nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufftf2d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, - std::complex *cj, int *iflag, float *eps, BIGINT *ms, - BIGINT *mt, std::complex *fk, finufft_opts *o, int *ier) { +void finufftf2d2many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, c64 *cj, int *iflag, + f32 *eps, i64 *ms, i64 *mt, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf2d2many(*ntransf, *nj, xj, yj, cj, *iflag, *eps, *ms, *mt, fk, o); } -void finufftf2d3_(BIGINT *nj, float *x, float *y, std::complex *c, int *iflag, - float *eps, BIGINT *nk, float *s, float *t, std::complex *f, - finufft_opts *o, int *ier) { +void finufftf2d3_(i64 *nj, f32 *x, f32 *y, c64 *c, int *iflag, f32 *eps, i64 *nk, f32 *s, + f32 *t, c64 *f, finufft_opts *o, int *ier) { *ier = finufftf2d3(*nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } -void finufftf2d3many_(int *ntransf, BIGINT *nj, float *x, float *y, - std::complex *c, int *iflag, float *eps, BIGINT *nk, - float *s, float *t, std::complex *f, finufft_opts *o, - int *ier) { +void finufftf2d3many_(int *ntransf, i64 *nj, f32 *x, f32 *y, c64 *c, int *iflag, f32 *eps, + i64 *nk, f32 *s, f32 *t, c64 *f, finufft_opts *o, int *ier) { *ier = finufftf2d3many(*ntransf, *nj, x, y, c, *iflag, *eps, *nk, s, t, f, o); } // --- 3D --- -void finufftf3d1_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, - int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, - std::complex *fk, finufft_opts *o, int *ier) { +void finufftf3d1_(i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int *iflag, f32 *eps, + i64 *ms, i64 *mt, i64 *mu, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf3d1(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufftf3d1many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, - std::complex *cj, int *iflag, float *eps, BIGINT *ms, - BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, - int *ier) { +void finufftf3d1many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, + int *iflag, f32 *eps, i64 *ms, i64 *mt, i64 *mu, c64 *fk, + finufft_opts *o, int *ier) { *ier = finufftf3d1many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufftf3d2_(BIGINT *nj, float *xj, float *yj, float *zj, std::complex *cj, - int *iflag, float *eps, BIGINT *ms, BIGINT *mt, BIGINT *mu, - std::complex *fk, finufft_opts *o, int *ier) { +void finufftf3d2_(i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int *iflag, f32 *eps, + i64 *ms, i64 *mt, i64 *mu, c64 *fk, finufft_opts *o, int *ier) { *ier = finufftf3d2(*nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufftf3d2many_(int *ntransf, BIGINT *nj, float *xj, float *yj, float *zj, - std::complex *cj, int *iflag, float *eps, BIGINT *ms, - BIGINT *mt, BIGINT *mu, std::complex *fk, finufft_opts *o, - int *ier) { +void finufftf3d2many_(int *ntransf, i64 *nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, + int *iflag, f32 *eps, i64 *ms, i64 *mt, i64 *mu, c64 *fk, + finufft_opts *o, int *ier) { *ier = finufftf3d2many(*ntransf, *nj, xj, yj, zj, cj, *iflag, *eps, *ms, *mt, *mu, fk, o); } -void finufftf3d3_(BIGINT *nj, float *x, float *y, float *z, std::complex *c, - int *iflag, float *eps, BIGINT *nk, float *s, float *t, float *u, - std::complex *f, finufft_opts *o, int *ier) { +void finufftf3d3_(i64 *nj, f32 *x, f32 *y, f32 *z, c64 *c, int *iflag, f32 *eps, i64 *nk, + f32 *s, f32 *t, f32 *u, c64 *f, finufft_opts *o, int *ier) { *ier = finufftf3d3(*nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } -void finufftf3d3many_(int *ntransf, BIGINT *nj, float *x, float *y, float *z, - std::complex *c, int *iflag, float *eps, BIGINT *nk, - float *s, float *t, float *u, std::complex *f, - finufft_opts *o, int *ier) { +void finufftf3d3many_(int *ntransf, i64 *nj, f32 *x, f32 *y, f32 *z, c64 *c, int *iflag, + f32 *eps, i64 *nk, f32 *s, f32 *t, f32 *u, c64 *f, finufft_opts *o, + int *ier) { *ier = finufftf3d3many(*ntransf, *nj, x, y, z, c, *iflag, *eps, *nk, s, t, u, f, o); } diff --git a/include/finufft.h b/include/finufft.h index 18eab5921..ab1da910d 100644 --- a/include/finufft.h +++ b/include/finufft.h @@ -2,8 +2,7 @@ // This contains both single and double precision user-facing commands. // "macro-safe" rewrite, including the plan object, Barnett 5/21/22-6/7/22. -// They will clobber any prior macros starting FINUFFT*, so in the lib/test -// sources finufft.h must be included before defs.h +// They will clobber any prior macros starting FINUFFT*. /* Devnotes. A) Two precisions done by including the "either precision" headers twice. diff --git a/include/finufft/defs.h b/include/finufft/defs.h deleted file mode 100644 index 633a7c1f3..000000000 --- a/include/finufft/defs.h +++ /dev/null @@ -1,137 +0,0 @@ -// Library private definitions & macros; also used by some test routines. -// If SINGLE defined, chooses single prec, otherwise double prec. -// Must be #included *after* finufft.h which clobbers FINUFFT* macros -// (see discussion near line 145 of this file). - -// Split out by Joakim Anden, Alex Barnett 9/20/18-9/24/18. -// Merged in dataTypes, private/public header split, clean. Barnett 6/7/22. -// finufft_plan_s made private, Wenda's idea. Barnett 8/8/22. - -/* Devnotes: - 1) Only need work for C++ since that's how compiled, including f_plan_s. - (But we use C-style templating, following fftw, etc.) -*/ - -#ifndef DEFS_H -#define DEFS_H - -// public header gives access to f_opts, f_spread_opts, f_plan... -// (and clobbers FINUFFT* macros; watch out!) -#include -#include -#include - -// --------------- Private data types for compilation in either prec --------- -// Devnote: must match those in relevant prec of public finufft.h interface! - -// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. -// This includes all calling arguments (eg M,N) that could be huge someday. -// Precision-independent real and complex types, for private lib/test compile -#ifdef SINGLE -using FLT = float; -#else -using FLT = double; -#endif -#include // we define C++ complex type only -using CPX = std::complex; - -// -------------- Math consts (not in math.h) and useful math macros ---------- -#include - -// either-precision unit imaginary number... -#define IMA (CPX(0.0, 1.0)) - -// MR: In the longer term I suggest to move -// away from M_PI, which was never part of the standard. -// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi -// if no namespaces are used? -// In C++20 these constants will be part of the language, and the problem will go away. -#ifndef M_PI // Windows apparently doesn't have this const -#define M_PI 3.14159265358979329 -#endif -#define M_1_2PI 0.159154943091895336 -#define M_2PI 6.28318530717958648 -// to avoid mixed precision operators in eg i*pi, an either-prec PI... -#define PI FLT(M_PI) - -// Random numbers: crappy unif random number generator in [0,1). -// These macros should probably be replaced by modern C++ std lib or random123. -// (RAND_MAX is in stdlib.h) -#include -static inline FLT rand01 [[maybe_unused]] () { return FLT(rand()) / FLT(RAND_MAX); } -// unif[-1,1]: -static inline FLT randm11 [[maybe_unused]] () { return 2 * rand01() - FLT(1); } -// complex unif[-1,1] for Re and Im: -static inline CPX crandm11 [[maybe_unused]] () { return randm11() + IMA * randm11(); } - -// Thread-safe seed-carrying versions of above (x is ptr to seed)... -// MR: we have to leave those as macros for now, as "rand_r" is deprecated -// and apparently no longer available on Windows. -#if 1 -#define rand01r(x) ((FLT)rand_r(x) / (FLT)RAND_MAX) -// unif[-1,1]: -#define randm11r(x) (2 * rand01r(x) - (FLT)1.0) -// complex unif[-1,1] for Re and Im: -#define crandm11r(x) (randm11r(x) + IMA * randm11r(x)) -#else -static inline FLT rand01r [[maybe_unused]] (unsigned int *x) { - return FLT(rand_r(x)) / FLT(RAND_MAX); -} -// unif[-1,1]: -static inline FLT randm11r [[maybe_unused]] (unsigned int *x) { - return 2 * rand01r(x) - FLT(1); -} -// complex unif[-1,1] for Re and Im: -static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { - return randm11r(x) + IMA * randm11r(x); -} -#endif - -// Prec-switching name macros (respond to SINGLE), used in lib & test sources -// and the plan object below. -// Note: crucially, these are now indep of macros used to gen public finufft.h! -// However, some overlap in name (FINUFFTIFY*, FINUFFT_PLAN*), meaning -// finufft.h cannot be included after defs.h since it undefs these overlaps :( -#ifdef SINGLE -// a macro to prepend finufft or finufftf to a string without a space. -// The 2nd level of indirection is needed for safety, see: -// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting -#define FINUFFTIFY_UNSAFE(x) finufftf##x -#else -#define FINUFFTIFY_UNSAFE(x) finufft##x -#endif -#define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) -// Now use the above tool to set up 2020-style macros used in tester source... -#define FINUFFT_PLAN FINUFFTIFY(_plan) -#define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) -#define FINUFFT_DEFAULT_OPTS FINUFFTIFY(_default_opts) -#define FINUFFT_MAKEPLAN FINUFFTIFY(_makeplan) -#define FINUFFT_SETPTS FINUFFTIFY(_setpts) -#define FINUFFT_EXECUTE FINUFFTIFY(_execute) -#define FINUFFT_DESTROY FINUFFTIFY(_destroy) -#define FINUFFT1D1 FINUFFTIFY(1d1) -#define FINUFFT1D2 FINUFFTIFY(1d2) -#define FINUFFT1D3 FINUFFTIFY(1d3) -#define FINUFFT2D1 FINUFFTIFY(2d1) -#define FINUFFT2D2 FINUFFTIFY(2d2) -#define FINUFFT2D3 FINUFFTIFY(2d3) -#define FINUFFT3D1 FINUFFTIFY(3d1) -#define FINUFFT3D2 FINUFFTIFY(3d2) -#define FINUFFT3D3 FINUFFTIFY(3d3) -#define FINUFFT1D1MANY FINUFFTIFY(1d1many) -#define FINUFFT1D2MANY FINUFFTIFY(1d2many) -#define FINUFFT1D3MANY FINUFFTIFY(1d3many) -#define FINUFFT2D1MANY FINUFFTIFY(2d1many) -#define FINUFFT2D2MANY FINUFFTIFY(2d2many) -#define FINUFFT2D3MANY FINUFFTIFY(2d3many) -#define FINUFFT3D1MANY FINUFFTIFY(3d1many) -#define FINUFFT3D2MANY FINUFFTIFY(3d2many) -#define FINUFFT3D3MANY FINUFFTIFY(3d3many) - -// -------- FINUFFT's plan object, prec-switching version ------------------ -// NB: now private (the public C++ or C etc user sees an opaque pointer to it) - -#include // (must come after complex.h) -struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T {}; - -#endif // DEFS_H diff --git a/include/finufft/dirft.h b/include/finufft/dirft.h index 5d13265a4..2449d864e 100644 --- a/include/finufft/dirft.h +++ b/include/finufft/dirft.h @@ -1,22 +1,36 @@ #ifndef DIRFT_H #define DIRFT_H -#include +#include -void dirft1d1(BIGINT nj, FLT *x, CPX *c, int isign, BIGINT ms, CPX *f); -void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f); -void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f); +template +void dirft1d1(BIGINT nj, T *x, std::complex *c, int isign, BIGINT ms, + std::complex *f); +template +void dirft1d2(BIGINT nj, T *x, std::complex *c, int iflag, BIGINT ms, + std::complex *f); +template +void dirft1d3(BIGINT nj, T *x, std::complex *c, int iflag, BIGINT nk, T *s, + std::complex *f); -void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); -void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f); -void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t, - CPX *f); +template +void dirft2d1(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT ms, BIGINT mt, + std::complex *f); +template +void dirft2d2(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT ms, BIGINT mt, + std::complex *f); +template +void dirft2d3(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT nk, T *s, T *t, + std::complex *f); -void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, - BIGINT mu, CPX *f); -void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, - BIGINT mu, CPX *f); -void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s, - FLT *t, FLT *u, CPX *f); +template +void dirft3d1(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT ms, + BIGINT mt, BIGINT mu, std::complex *f); +template +void dirft3d2(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT ms, + BIGINT mt, BIGINT mu, std::complex *f); +template +void dirft3d3(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT nk, T *s, + T *t, T *u, std::complex *f); #endif diff --git a/include/finufft/fft.h b/include/finufft/fft.h index c6d5de7a5..1fd8d1635 100644 --- a/include/finufft/fft.h +++ b/include/finufft/fft.h @@ -10,6 +10,9 @@ template class Finufft_FFT_plan { public: [[maybe_unused]] Finufft_FFT_plan(void (*)(void *) = nullptr, void (*)(void *) = nullptr, void * = nullptr) {} + // deleting these operations to be consistent with the FFTW plans (seel below) + Finufft_FFT_plan(const Finufft_FFT_plan &) = delete; + Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete; [[maybe_unused]] void plan(const std::vector & /*dims*/, size_t /*batchSize*/, std::complex * /*ptr*/, int /*sign*/, int /*options*/, int /*nthreads*/) {} @@ -63,11 +66,15 @@ template<> struct Finufft_FFT_plan { #endif unlock(); } + // we have raw pointers in the object (the FFTW plan). + // If we allow copying those, we end up destroying the plans multiple times. + Finufft_FFT_plan(const Finufft_FFT_plan &) = delete; [[maybe_unused]] ~Finufft_FFT_plan() { lock(); fftwf_destroy_plan(plan_); unlock(); } + Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete; void plan [[maybe_unused]] (const std::vector &dims, size_t batchSize, @@ -131,11 +138,13 @@ template<> struct Finufft_FFT_plan { #endif unlock(); } + Finufft_FFT_plan(const Finufft_FFT_plan &) = delete; [[maybe_unused]] ~Finufft_FFT_plan() { lock(); fftw_destroy_plan(plan_); unlock(); } + Finufft_FFT_plan &operator=(const Finufft_FFT_plan &) = delete; void plan [[maybe_unused]] (const std::vector &dims, size_t batchSize, diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index de2f2dab9..cdfa4ab33 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -95,6 +95,17 @@ inline constexpr BIGINT MAX_NF = BIGINT(1e12); // values for M = nj (also nk in type 3)... inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); +// MR: In the longer term I suggest to move +// away from M_PI, which was never part of the standard. +// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi +// if no namespaces are used? +// In C++20 these constants will be part of the language, and the problem will go away. +#ifndef M_PI // Windows apparently doesn't have this const +#define M_PI 3.14159265358979329 +#endif +#define M_1_2PI 0.159154943091895336 +#define M_2PI 6.28318530717958648 + // ----- OpenMP macros which also work when omp not present ----- // Allows compile-time switch off of openmp, so compilation without any openmp // is done (Note: _OPENMP is automatically set by -fopenmp compile flag) @@ -138,7 +149,8 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ // These default and delete specifications just state the obvious, // but are here to silence compiler warnings. - FINUFFT_PLAN_T() = default; + FINUFFT_PLAN_T(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, TF tol, + finufft_opts *opts, int &ier); // Copy construction and assignent are already deleted implicitly // because of the unique_ptr member. FINUFFT_PLAN_T(const FINUFFT_PLAN_T &) = delete; @@ -189,7 +201,8 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ std::vector Sp, Tp, Up; // internal primed targs (s'_k, etc), // allocated type3params t3P; // groups together type 3 shift, scale, phase, parameters - FINUFFT_PLAN_T *innerT2plan = nullptr; // ptr used for type 2 in step 2 of type 3 + std::unique_ptr> innerT2plan; // ptr used for type 2 in step 2 of + // type 3 // other internal structs std::unique_ptr> fftPlan; @@ -204,10 +217,5 @@ void finufft_default_opts_t(finufft_opts *o); template int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts); -template -int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, - TF *s, TF *t, TF *u); -template -int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); #endif // FINUFFT_CORE_H diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h index 8a83af3ce..04c4734ba 100644 --- a/include/finufft/spreadinterp.h +++ b/include/finufft/spreadinterp.h @@ -1,12 +1,13 @@ // Defines interface to spreading/interpolation code. -// Devnotes: see defs.h for definition of MAX_NSPREAD (as of 9/24/18). +// Devnotes: see finufft_core.h for definition of MAX_NSPREAD // RESCALE macro moved to spreadinterp.cpp, 7/15/20. // finufft_spread_opts renamed 6/7/22. #ifndef SPREADINTERP_H #define SPREADINTERP_H +#include #include /* Bitwise debugging timing flag (TF) defs; see finufft_spread_opts.flags. diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index bdd4cf147..e8e80e9f8 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -11,21 +11,120 @@ // for omp rand filling #define TEST_RANDCHUNK 1000000 -// the public interface: since this clobbers FINUFFT* macros, must be included -// *before* private defs.h... +// the public interface #include -// convenient private finufft internals (must come after finufft.h) +// convenient private finufft internals +#include #include -// prec-switching (via SINGLE) to set up FLT, CPX, BIGINT, FINUFFT1D1, etc... -#include +#include + +// --------------- Private data types for compilation in either prec --------- +// Devnote: must match those in relevant prec of public finufft.h interface! + +// All indexing in library that potentially can exceed 2^31 uses 64-bit signed. +// This includes all calling arguments (eg M,N) that could be huge someday. + +// Precision-independent real and complex types, for private lib/test compile +#ifdef SINGLE +using FLT = float; +#else +using FLT = double; +#endif +#include // we define C++ complex type only +using CPX = std::complex; + +// -------------- Math consts (not in math.h) and useful math macros ---------- +#include + +// either-precision unit imaginary number... +#define IMA (CPX(0.0, 1.0)) + +// to avoid mixed precision operators in eg i*pi, an either-prec PI... +#define PI FLT(M_PI) + +// Random numbers: crappy unif random number generator in [0,1). +// These macros should probably be replaced by modern C++ std lib or random123. +// (RAND_MAX is in stdlib.h) +#include +static inline FLT rand01 [[maybe_unused]] () { return FLT(rand()) / FLT(RAND_MAX); } +// unif[-1,1]: +static inline FLT randm11 [[maybe_unused]] () { return 2 * rand01() - FLT(1); } +// complex unif[-1,1] for Re and Im: +static inline CPX crandm11 [[maybe_unused]] () { return randm11() + IMA * randm11(); } + +// Thread-safe seed-carrying versions of above (x is ptr to seed)... +// MR: we have to leave those as macros for now, as "rand_r" is deprecated +// and apparently no longer available on Windows. +#if 1 +#define rand01r(x) ((FLT)rand_r(x) / (FLT)RAND_MAX) +// unif[-1,1]: +#define randm11r(x) (2 * rand01r(x) - (FLT)1.0) +// complex unif[-1,1] for Re and Im: +#define crandm11r(x) (randm11r(x) + IMA * randm11r(x)) +#else +static inline FLT rand01r [[maybe_unused]] (unsigned int *x) { + return FLT(rand_r(x)) / FLT(RAND_MAX); +} +// unif[-1,1]: +static inline FLT randm11r [[maybe_unused]] (unsigned int *x) { + return 2 * rand01r(x) - FLT(1); +} +// complex unif[-1,1] for Re and Im: +static inline CPX crandm11r [[maybe_unused]] (unsigned int *x) { + return randm11r(x) + IMA * randm11r(x); +} +#endif + +// Prec-switching name macros (respond to SINGLE), used in lib & test sources +// and the plan object below. +// Note: crucially, these are now indep of macros used to gen public finufft.h! +#ifdef SINGLE +// a macro to prepend finufft or finufftf to a string without a space. +// The 2nd level of indirection is needed for safety, see: +// https://isocpp.org/wiki/faq/misc-technical-issues#macros-with-token-pasting +#define FINUFFTIFY_UNSAFE(x) finufftf##x +#else +#define FINUFFTIFY_UNSAFE(x) finufft##x +#endif +#define FINUFFTIFY(x) FINUFFTIFY_UNSAFE(x) +// Now use the above tool to set up 2020-style macros used in tester source... +#define FINUFFT_PLAN FINUFFTIFY(_plan) +#define FINUFFT_PLAN_S FINUFFTIFY(_plan_s) +#define FINUFFT_DEFAULT_OPTS FINUFFTIFY(_default_opts) +#define FINUFFT_MAKEPLAN FINUFFTIFY(_makeplan) +#define FINUFFT_SETPTS FINUFFTIFY(_setpts) +#define FINUFFT_EXECUTE FINUFFTIFY(_execute) +#define FINUFFT_DESTROY FINUFFTIFY(_destroy) +#define FINUFFT1D1 FINUFFTIFY(1d1) +#define FINUFFT1D2 FINUFFTIFY(1d2) +#define FINUFFT1D3 FINUFFTIFY(1d3) +#define FINUFFT2D1 FINUFFTIFY(2d1) +#define FINUFFT2D2 FINUFFTIFY(2d2) +#define FINUFFT2D3 FINUFFTIFY(2d3) +#define FINUFFT3D1 FINUFFTIFY(3d1) +#define FINUFFT3D2 FINUFFTIFY(3d2) +#define FINUFFT3D3 FINUFFTIFY(3d3) +#define FINUFFT1D1MANY FINUFFTIFY(1d1many) +#define FINUFFT1D2MANY FINUFFTIFY(1d2many) +#define FINUFFT1D3MANY FINUFFTIFY(1d3many) +#define FINUFFT2D1MANY FINUFFTIFY(2d1many) +#define FINUFFT2D2MANY FINUFFTIFY(2d2many) +#define FINUFFT2D3MANY FINUFFTIFY(2d3many) +#define FINUFFT3D1MANY FINUFFTIFY(3d1many) +#define FINUFFT3D2MANY FINUFFTIFY(3d2many) +#define FINUFFT3D3MANY FINUFFTIFY(3d3many) + +// -------- FINUFFT's plan object, prec-switching version ------------------ +// NB: now private (the public C++ or C etc user sees an opaque pointer to it) + +#include +struct FINUFFT_PLAN_S : public FINUFFT_PLAN_T {}; // std stuff for tester src #include #include #include -#include -#include #include #endif // TEST_DEFS_H diff --git a/include/finufft/utils.h b/include/finufft/utils.h index 040f60543..c550a8bb9 100644 --- a/include/finufft/utils.h +++ b/include/finufft/utils.h @@ -14,51 +14,44 @@ namespace utils { // ahb's low-level array helpers template -FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, std::complex *a, - std::complex *b) +FINUFFT_EXPORT T FINUFFT_CDECL relerrtwonorm(BIGINT n, const std::complex *a, + const std::complex *b) // ||a-b||_2 / ||a||_2 { T err = 0.0, nrm = 0.0; for (BIGINT m = 0; m < n; ++m) { - nrm += real(conj(a[m]) * a[m]); - std::complex diff = a[m] - b[m]; - err += real(conj(diff) * diff); + nrm += std::norm(a[m]); + err += std::norm(a[m] - b[m]); } return sqrt(err / nrm); } template -FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, std::complex *a, - std::complex *b) +FINUFFT_EXPORT T FINUFFT_CDECL errtwonorm(BIGINT n, const std::complex *a, + const std::complex *b) // ||a-b||_2 { T err = 0.0; // compute error 2-norm - for (BIGINT m = 0; m < n; ++m) { - std::complex diff = a[m] - b[m]; - err += real(conj(diff) * diff); - } + for (BIGINT m = 0; m < n; ++m) err += std::norm(a[m] - b[m]); return sqrt(err); } template -FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, std::complex *a) +FINUFFT_EXPORT T FINUFFT_CDECL twonorm(BIGINT n, const std::complex *a) // ||a||_2 { T nrm = 0.0; - for (BIGINT m = 0; m < n; ++m) nrm += real(conj(a[m]) * a[m]); + for (BIGINT m = 0; m < n; ++m) nrm += std::norm(a[m]); return sqrt(nrm); } template -FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, std::complex *a) +FINUFFT_EXPORT T FINUFFT_CDECL infnorm(BIGINT n, const std::complex *a) // ||a||_infty { T nrm = 0.0; - for (BIGINT m = 0; m < n; ++m) { - T aa = real(conj(a[m]) * a[m]); - if (aa > nrm) nrm = aa; - } + for (BIGINT m = 0; m < n; ++m) nrm = std::max(nrm, std::norm(a[m])); return sqrt(nrm); } template -FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi) +FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, const T *a, T *lo, T *hi) // With a a length-n array, writes out min(a) to lo and max(a) to hi, // so that all a values lie in [lo,hi]. // If n==0, lo and hi are not finite. @@ -71,10 +64,10 @@ FINUFFT_EXPORT void FINUFFT_CDECL arrayrange(BIGINT n, T *a, T *lo, T *hi) } } template -FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, T *a, T *w, T *c) +FINUFFT_EXPORT void FINUFFT_CDECL arraywidcen(BIGINT n, const T *a, T *w, T *c) // Writes out w = half-width and c = center of an interval enclosing all a[n]'s // Only chooses a nonzero center if this increases w by less than fraction -// ARRAYWIDCEN_GROWFRAC defined in defs.h. +// ARRAYWIDCEN_GROWFRAC defined in finufft_core.h. // This prevents rephasings which don't grow nf by much. 6/8/17 // If n==0, w and c are not finite. { diff --git a/include/finufft_eitherprec.h b/include/finufft_eitherprec.h index 3f0a7d95c..fa698cec2 100644 --- a/include/finufft_eitherprec.h +++ b/include/finufft_eitherprec.h @@ -4,8 +4,7 @@ /* Devnotes. 1) Since everything here is exposed to the public interface, macros must be safe, eg FINUFFTsomething. - 2) They will clobber any prior macros starting FINUFFT*, so in the lib/test - sources finufft.h must be included before defs.h + 2) They will clobber any prior macros starting FINUFFT* 3) for debug of header macros, see finufft.h */ @@ -82,7 +81,7 @@ extern "C" { typedef struct FINUFFT_PLAN_S *FINUFFT_PLAN; // ------------------ the guru interface ------------------------------------ -// (sources in finufft.cpp) +// (sources in c_interface.cpp) FINUFFT_EXPORT void FINUFFT_CDECL FINUFFTIFY(_default_opts)(finufft_opts *o); FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_makeplan)( @@ -96,7 +95,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_execute)( FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(_destroy)(FINUFFT_PLAN plan); // ----------------- the 18 simple interfaces ------------------------------- -// (sources in simpleinterfaces.cpp) +// (sources in c_interface.cpp) FINUFFT_EXPORT int FINUFFT_CDECL FINUFFTIFY(1d1)( FINUFFT_BIGINT nj, FINUFFT_FLT *xj, FINUFFT_CPX *cj, int iflag, FINUFFT_FLT eps, diff --git a/makefile b/makefile index c9754a376..7a715a7d5 100644 --- a/makefile +++ b/makefile @@ -66,7 +66,7 @@ XSIMD_DIR := $(DEPS_ROOT)/xsimd # DUCC sources optional dependency repo DUCC_URL := https://gitlab.mpcdf.mpg.de/mtr/ducc.git -DUCC_VERSION := ducc0_0_34_0 +DUCC_VERSION := ducc0_0_35_0 DUCC_DIR := $(DEPS_ROOT)/ducc # this dummy file used as empty target by make... DUCC_COOKIE := $(DUCC_DIR)/.finufft_has_ducc @@ -137,7 +137,7 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB) SOBJS = src/utils.o src/spreadinterp.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/simpleinterfaces.o fortran/finufftfort.o $(DUCC_OBJS) +OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean @@ -181,12 +181,10 @@ HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) $(CC) -c $(CFLAGS) $< -o $@ %.o: %.f $(FC) -c $(FFLAGS) $< -o $@ -%_32.o: %.f - $(FC) -DSINGLE -c $(FFLAGS) $< -o $@ # spreadinterp include auto-generated code, xsimd header-only dependency; # if FFT=DUCC also setup ducc with fft.h dependency on $(DUCC_SETUP)... -# Note src/spreadinterp.cpp includes finufft/defs.h which includes finufft/fft.h +# Note src/spreadinterp.cpp includes finufft/finufft_core.h which includes finufft/fft.h # so fftw/ducc header needed for spreadinterp, though spreadinterp should not # depend on fftw/ducc directly? include/finufft/fft.h: $(DUCC_SETUP) diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index 5e27289d8..ded866ba0 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -1,6 +1,6 @@ // public header #include "finufft.h" -#include "finufft/defs.h" +#include "finufft/test_defs.h" // private access to timer #include "finufft/utils.h" diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index d30626007..8c35828bd 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 14aad3420..9787d86b0 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -1,5 +1,5 @@ -#include #include +#include #include #include diff --git a/src/simpleinterfaces.cpp b/src/c_interface.cpp similarity index 55% rename from src/simpleinterfaces.cpp rename to src/c_interface.cpp index d2606ee6b..6f7deb61b 100644 --- a/src/simpleinterfaces.cpp +++ b/src/c_interface.cpp @@ -12,8 +12,6 @@ using namespace std; As of v1.2 these simply invoke the guru interface, through a helper layer. See ../docs/usage.rst or http://finufft.readthedocs.io for documentation all routines here. - This compiles in either double or single precision (based on -DSINGLE), - producing functions finufft?d?{many} or finufftf?1?{many} respectively. Authors: Andrea Malleo and Alex Barnett, 2019-2020. Safe namespacing, Barnett, May 2022. @@ -42,20 +40,18 @@ int finufftf_makeplan(int type, int dim, const i64 *n_modes, int iflag, int ntra int finufft_setpts(finufft_plan p, i64 nj, f64 *xj, f64 *yj, f64 *zj, i64 nk, f64 *s, f64 *t, f64 *u) { - return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, - nk, s, t, u); + return reinterpret_cast *>(p)->setpts(nj, xj, yj, zj, nk, s, t, u); } int finufftf_setpts(finufftf_plan p, i64 nj, f32 *xj, f32 *yj, f32 *zj, i64 nk, f32 *s, f32 *t, f32 *u) { - return finufft_setpts_t(reinterpret_cast *>(p), nj, xj, yj, zj, - nk, s, t, u); + return reinterpret_cast *>(p)->setpts(nj, xj, yj, zj, nk, s, t, u); } int finufft_execute(finufft_plan p, c128 *cj, c128 *fk) { - return finufft_execute_t(reinterpret_cast *>(p), cj, fk); + return reinterpret_cast *>(p)->execute(cj, fk); } int finufftf_execute(finufftf_plan p, c64 *cj, c64 *fk) { - return finufft_execute_t(reinterpret_cast *>(p), cj, fk); + return reinterpret_cast *>(p)->execute(cj, fk); } int finufft_destroy(finufft_plan p) @@ -68,7 +64,6 @@ int finufft_destroy(finufft_plan p) return 1; delete reinterpret_cast *>(p); - p = nullptr; return 0; // success } int finufftf_destroy(finufftf_plan p) @@ -81,19 +76,15 @@ int finufftf_destroy(finufftf_plan p) return 1; delete reinterpret_cast *>(p); - p = nullptr; return 0; // success } // Helper layer ........................................................... -namespace finufft { -namespace common { - template -static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T *zj, - std::complex *cj, int iflag, T eps, - const std::array &n_modes, i64 nk, T *s, T *t, T *u, - std::complex *fk, finufft_opts *popts) +static int guru(int n_dims, int type, int n_transf, i64 nj, const std::array &xyz, + std::complex *cj, int iflag, T eps, const std::array &n_modes, + i64 nk, const std::array &stu, std::complex *fk, + finufft_opts *popts) // Helper layer between simple interfaces (with opts) and the guru functions. // Author: Andrea Malleo, 2019. { @@ -107,14 +98,14 @@ static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T return ier; } - int ier2 = finufft_setpts_t(plan, nj, xj, yj, zj, nk, s, t, u); + int ier2 = plan->setpts(nj, xyz[0], xyz[1], xyz[2], nk, stu[0], stu[1], stu[2]); if (ier2 > 1) { fprintf(stderr, "FINUFFT invokeGuru: setpts error (ier=%d)!\n", ier2); delete plan; return ier2; } - int ier3 = finufft_execute_t(plan, cj, fk); + int ier3 = plan->execute(cj, fk); if (ier3 > 1) { fprintf(stderr, "FINUFFT invokeGuru: execute error (ier=%d)!\n", ier3); delete plan; @@ -125,258 +116,179 @@ static int guruCall(int n_dims, int type, int n_transf, i64 nj, T *xj, T *yj, T return max(max(ier, ier2), ier3); // in case any one gave a (positive!) warning } -} // namespace common -} // namespace finufft - -using namespace finufft::common; - // Dimension 1111111111111111111111111111111111111111111111111111111111111111 int finufft1d1many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, - c128 *fk, finufft_opts *opts) -// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst -{ - return guruCall(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + c128 *fk, finufft_opts *opts) { + return guru(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts); } int finufftf1d1many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, - c64 *fk, finufft_opts *opts) -// Type-1 1D complex nonuniform FFT for many vectors. See ../docs/usage.rst -{ - return guruCall(1, 1, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + c64 *fk, finufft_opts *opts) { + return guru(1, 1, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts); } int finufft1d1(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk, - finufft_opts *opts) -// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufft1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int finufftf1d1(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk, - finufft_opts *opts) -// Type-1 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufftf1d1many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int finufft1d2many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, - c128 *fk, finufft_opts *opts) -// Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + c128 *fk, finufft_opts *opts) { + return guru(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts); } int finufftf1d2many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, - c64 *fk, finufft_opts *opts) -// Type-2 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(1, 2, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {ms, 1, 1}, 0, nullptr, nullptr, nullptr, fk, opts); + c64 *fk, finufft_opts *opts) { + return guru(1, 2, n_transf, nj, {xj}, cj, iflag, eps, {ms, 1, 1}, 0, {}, fk, opts); } int finufft1d2(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 ms, c128 *fk, - finufft_opts *opts) -// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufft1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int finufftf1d2(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 ms, c64 *fk, - finufft_opts *opts) -// Type-2 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufftf1d2many(1, nj, xj, cj, iflag, eps, ms, fk, opts); } int finufft1d3many(int n_transf, i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk, - f64 *s, c128 *fk, finufft_opts *opts) -// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); + f64 *s, c128 *fk, finufft_opts *opts) { + return guru(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk, + opts); } int finufftf1d3many(int n_transf, i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk, - f32 *s, c64 *fk, finufft_opts *opts) -// Type-3 1D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(1, 3, n_transf, nj, xj, nullptr, nullptr, cj, iflag, eps, - {0, 0, 0}, nk, s, nullptr, nullptr, fk, opts); + f32 *s, c64 *fk, finufft_opts *opts) { + return guru(1, 3, n_transf, nj, {xj}, cj, iflag, eps, {0, 0, 0}, nk, {s}, fk, + opts); } int finufft1d3(i64 nj, f64 *xj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s, c128 *fk, - finufft_opts *opts) -// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufft1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } int finufftf1d3(i64 nj, f32 *xj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s, c64 *fk, - finufft_opts *opts) -// Type-3 1D complex nonuniform FFT. See ../docs/usage.rst -{ + finufft_opts *opts) { return finufftf1d3many(1, nj, xj, cj, iflag, eps, nk, s, fk, opts); } // Dimension 22222222222222222222222222222222222222222222222222222222222222222 int finufft2d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps, - i64 ms, i64 mt, c128 *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, - nullptr, nullptr, nullptr, fk, opts); + i64 ms, i64 mt, c128 *fk, finufft_opts *opts) { + return guru(2, 1, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk, + opts); } int finufftf2d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps, - i64 ms, i64 mt, c64 *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 1, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, - nullptr, nullptr, nullptr, fk, opts); + i64 ms, i64 mt, c64 *fk, finufft_opts *opts) { + return guru(2, 1, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk, + opts); } int finufft2d1(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt, - c128 *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst -{ + c128 *fk, finufft_opts *opts) { return finufft2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int finufftf2d1(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt, - c64 *fk, finufft_opts *opts) -// Type-1 2D complex nonuniform FFT. See ../docs/usage.rst -{ + c64 *fk, finufft_opts *opts) { return finufftf2d1many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int finufft2d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *c, int iflag, f64 eps, - i64 ms, i64 mt, c128 *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, - nullptr, nullptr, nullptr, fk, opts); + i64 ms, i64 mt, c128 *fk, finufft_opts *opts) { + return guru(2, 2, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk, + opts); } int finufftf2d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *c, int iflag, f32 eps, - i64 ms, i64 mt, c64 *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 2, n_transf, nj, xj, yj, nullptr, c, iflag, eps, {ms, mt, 1}, 0, - nullptr, nullptr, nullptr, fk, opts); + i64 ms, i64 mt, c64 *fk, finufft_opts *opts) { + return guru(2, 2, n_transf, nj, {xj, yj}, c, iflag, eps, {ms, mt, 1}, 0, {}, fk, + opts); } int finufft2d2(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 ms, i64 mt, - c128 *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst -{ + c128 *fk, finufft_opts *opts) { return finufft2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int finufftf2d2(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 ms, i64 mt, - c64 *fk, finufft_opts *opts) -// Type-2 2D complex nonuniform FFT. See ../docs/usage.rst -{ + c64 *fk, finufft_opts *opts) { return finufftf2d2many(1, nj, xj, yj, cj, iflag, eps, ms, mt, fk, opts); } int finufft2d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, - i64 nk, f64 *s, f64 *t, c128 *fk, finufft_opts *opts) -// Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk, - s, t, nullptr, fk, opts); + i64 nk, f64 *s, f64 *t, c128 *fk, finufft_opts *opts) { + return guru(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, {s, t}, + fk, opts); } int finufftf2d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, - i64 nk, f32 *s, f32 *t, c64 *fk, finufft_opts *opts) -// Type-3 2D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(2, 3, n_transf, nj, xj, yj, nullptr, cj, iflag, eps, {0, 0, 0}, nk, - s, t, nullptr, fk, opts); + i64 nk, f32 *s, f32 *t, c64 *fk, finufft_opts *opts) { + return guru(2, 3, n_transf, nj, {xj, yj}, cj, iflag, eps, {0, 0, 0}, nk, {s, t}, + fk, opts); } int finufft2d3(i64 nj, f64 *xj, f64 *yj, c128 *cj, int iflag, f64 eps, i64 nk, f64 *s, - f64 *t, c128 *fk, finufft_opts *opts) -// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst -{ + f64 *t, c128 *fk, finufft_opts *opts) { return finufft2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } int finufftf2d3(i64 nj, f32 *xj, f32 *yj, c64 *cj, int iflag, f32 eps, i64 nk, f32 *s, - f32 *t, c64 *fk, finufft_opts *opts) -// Type-3 2D complex nonuniform FFT. See ../docs/usage.rst -{ + f32 *t, c64 *fk, finufft_opts *opts) { return finufftf2d3many(1, nj, xj, yj, cj, iflag, eps, nk, s, t, fk, opts); } // Dimension 3333333333333333333333333333333333333333333333333333333333333333 int finufft3d1many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, - f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) -// Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, - nullptr, nullptr, nullptr, fk, opts); + f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) { + return guru(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {}, + fk, opts); } int finufftf3d1many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, - f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) -// Type-1 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 1, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, - nullptr, nullptr, nullptr, fk, opts); + f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) { + return guru(3, 1, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {}, + fk, opts); } int finufft3d1(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms, - i64 mt, i64 mu, c128 *fk, finufft_opts *opts) -// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst -{ + i64 mt, i64 mu, c128 *fk, finufft_opts *opts) { return finufft3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int finufftf3d1(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms, - i64 mt, i64 mu, c64 *fk, finufft_opts *opts) -// Type-1 3D complex nonuniform FFT. See ../docs/usage.rst -{ + i64 mt, i64 mu, c64 *fk, finufft_opts *opts) { return finufftf3d1many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int finufft3d2many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, - f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) -// Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, - nullptr, nullptr, nullptr, fk, opts); + f64 eps, i64 ms, i64 mt, i64 mu, c128 *fk, finufft_opts *opts) { + return guru(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {}, + fk, opts); } int finufftf3d2many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, - f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) -// Type-2 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 2, n_transf, nj, xj, yj, zj, cj, iflag, eps, {ms, mt, mu}, 0, - nullptr, nullptr, nullptr, fk, opts); + f32 eps, i64 ms, i64 mt, i64 mu, c64 *fk, finufft_opts *opts) { + return guru(3, 2, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {ms, mt, mu}, 0, {}, + fk, opts); } int finufft3d2(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 ms, - i64 mt, i64 mu, c128 *fk, finufft_opts *opts) -// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst -{ + i64 mt, i64 mu, c128 *fk, finufft_opts *opts) { return finufft3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int finufftf3d2(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 ms, - i64 mt, i64 mu, c64 *fk, finufft_opts *opts) -// Type-2 3D complex nonuniform FFT. See ../docs/usage.rst -{ + i64 mt, i64 mu, c64 *fk, finufft_opts *opts) { return finufftf3d2many(1, nj, xj, yj, zj, cj, iflag, eps, ms, mt, mu, fk, opts); } int finufft3d3many(int n_transf, i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, - f64 eps, i64 nk, f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) -// Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s, - t, u, fk, opts); + f64 eps, i64 nk, f64 *s, f64 *t, f64 *u, c128 *fk, + finufft_opts *opts) { + return guru(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t, u}, fk, opts); } int finufftf3d3many(int n_transf, i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, - f32 eps, i64 nk, f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) -// Type-3 3D complex nonuniform FFT, many vectors. See ../docs/usage.rst -{ - return guruCall(3, 3, n_transf, nj, xj, yj, zj, cj, iflag, eps, {0, 0, 0}, nk, s, - t, u, fk, opts); + f32 eps, i64 nk, f32 *s, f32 *t, f32 *u, c64 *fk, + finufft_opts *opts) { + return guru(3, 3, n_transf, nj, {xj, yj, zj}, cj, iflag, eps, {0, 0, 0}, nk, + {s, t, u}, fk, opts); } int finufft3d3(i64 nj, f64 *xj, f64 *yj, f64 *zj, c128 *cj, int iflag, f64 eps, i64 nk, - f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) -// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst -{ + f64 *s, f64 *t, f64 *u, c128 *fk, finufft_opts *opts) { return finufft3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } int finufftf3d3(i64 nj, f32 *xj, f32 *yj, f32 *zj, c64 *cj, int iflag, f32 eps, i64 nk, - f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) -// Type-3 3D complex nonuniform FFT. See ../docs/usage.rst -{ + f32 *s, f32 *t, f32 *u, c64 *fk, finufft_opts *opts) { return finufftf3d3many(1, nj, xj, yj, zj, cj, iflag, eps, nk, s, t, u, fk, opts); } diff --git a/src/finufft_core.cpp b/src/finufft.cpp similarity index 82% rename from src/finufft_core.cpp rename to src/finufft.cpp index e950e2160..2130cfd8a 100644 --- a/src/finufft_core.cpp +++ b/src/finufft.cpp @@ -26,7 +26,7 @@ using namespace finufft::quadrature; As of v1.2 these replace the old hand-coded separate 9 finufft?d?() functions and the two finufft2d?many() functions. The (now 18) simple C++ interfaces - are in simpleinterfaces.cpp. + are in c_interface.cpp. Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: @@ -65,10 +65,6 @@ Algorithm summaries taken from old finufft?d?() documentation, Feb-Jun 2017: Design notes for guru interface implementation: -* Since finufft_plan is C-compatible, we need to use malloc/free for its - allocatable arrays, keeping it quite low-level. We can't use std::vector - since that would only survive in the scope of each function. - * Thread-safety: FINUFFT plans are passed as pointers, so it has no global state apart from that associated with FFTW (and the did_fftw_init). */ @@ -93,8 +89,8 @@ static int set_nf_type12(BIGINT ms, const finufft_opts &opts, return 0; } else { fprintf(stderr, - "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting even a " - "malloc\n", + "[%s] nf=%.3g exceeds MAX_NF of %.3g, so exit without attempting " + "memory allocation\n", __func__, (double)*nf, (double)MAX_NF); return FINUFFT_ERR_MAXNALLOC; } @@ -540,64 +536,56 @@ void finufft_default_opts_t(finufft_opts *o) // PPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPPP template -int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, - TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) +FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, int iflag, + int ntrans_, TF tol_, finufft_opts *opts_, int &ier) + : type(type_), dim(dim_), ntrans(ntrans_), tol(tol_) // Populates the fields of finufft_plan which is pointed to by "pp". // opts is ptr to a finufft_opts to set options, or nullptr to use defaults. // For some of the fields (if "auto" selected) here choose the actual setting. // For types 1,2 allocates memory for internal working arrays, // evaluates spreading kernel coefficients, and instantiates the fftw_plan { - FINUFFT_PLAN_T *p; - p = new FINUFFT_PLAN_T; // allocate fresh plan struct - *pp = p; // pass out plan as ptr to plan struct - - if (!opts) // use default opts - finufft_default_opts_t(&(p->opts)); - else // or read from what's passed in - p->opts = *opts; // keep a deep copy; changing *opts now has no effect + if (!opts_) // use default opts + finufft_default_opts_t(&opts); + else // or read from what's passed in + opts = *opts_; // keep a deep copy; changing *opts now has no effect - if (p->opts.debug) // do a hello world + if (opts.debug) // do a hello world printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", __func__); - p->fftPlan = std::make_unique>( - p->opts.fftw_lock_fun, p->opts.fftw_unlock_fun, p->opts.fftw_lock_data); + fftPlan = std::make_unique>( + opts.fftw_lock_fun, opts.fftw_unlock_fun, opts.fftw_lock_data); if ((type != 1) && (type != 2) && (type != 3)) { fprintf(stderr, "[%s] Invalid type (%d), should be 1, 2 or 3.\n", __func__, type); - return FINUFFT_ERR_TYPE_NOTVALID; + throw int(FINUFFT_ERR_TYPE_NOTVALID); } if ((dim != 1) && (dim != 2) && (dim != 3)) { fprintf(stderr, "[%s] Invalid dim (%d), should be 1, 2 or 3.\n", __func__, dim); - return FINUFFT_ERR_DIM_NOTVALID; + throw int(FINUFFT_ERR_DIM_NOTVALID); } if (ntrans < 1) { fprintf(stderr, "[%s] ntrans (%d) should be at least 1.\n", __func__, ntrans); - return FINUFFT_ERR_NTRANS_NOTVALID; + throw int(FINUFFT_ERR_NTRANS_NOTVALID); } - if (!p->opts.fftw_lock_fun != !p->opts.fftw_unlock_fun) { + if (!opts.fftw_lock_fun != !opts.fftw_unlock_fun) { fprintf(stderr, "[%s] fftw_(un)lock functions should be both null or both set\n", __func__); - return FINUFFT_ERR_LOCK_FUNS_INVALID; - ; + throw int(FINUFFT_ERR_LOCK_FUNS_INVALID); } // get stuff from args... - p->type = type; - p->dim = dim; - p->ntrans = ntrans; - p->tol = tol; - p->fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input + fftSign = (iflag >= 0) ? 1 : -1; // clean up flag input - // choose overall # threads... + // choose overall # threads... #ifdef _OPENMP int ompmaxnthr = MY_OMP_GET_MAX_THREADS(); int nthr = ompmaxnthr; // default: use as many as OMP gives us // (the above could be set, or suggested set, to 1 for small enough problems...) - if (p->opts.nthreads > 0) { - nthr = p->opts.nthreads; // user override, now without limit - if (p->opts.showwarn && (nthr > ompmaxnthr)) + if (opts.nthreads > 0) { + nthr = opts.nthreads; // user override, now without limit + if (opts.showwarn && (nthr > ompmaxnthr)) fprintf(stderr, "%s warning: using opts.nthreads=%d, more than the %d OpenMP claims " "available; note large nthreads can be slower.\n", @@ -605,61 +593,61 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int } #else int nthr = 1; // always 1 thread (avoid segfault) - if (p->opts.nthreads > 1) + if (opts.nthreads > 1) fprintf(stderr, "%s warning: opts.nthreads=%d but library is single-threaded; ignoring!\n", - __func__, p->opts.nthreads); + __func__, opts.nthreads); #endif - p->opts.nthreads = nthr; // store actual # thr planned for + opts.nthreads = nthr; // store actual # thr planned for // (this sets/limits all downstream spread/interp, 1dkernel, and FFT thread counts...) // choose batchSize for types 1,2 or 3... (uses int ceil(b/a)=1+(b-1)/a trick) - if (p->opts.maxbatchsize == 0) { // logic to auto-set best batchsize - p->nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss - p->batchSize = 1 + (ntrans - 1) / p->nbatch; // then cut # thr in each b - } else { // batchSize override by user - p->batchSize = std::min(p->opts.maxbatchsize, ntrans); - p->nbatch = 1 + (ntrans - 1) / p->batchSize; // resulting # batches + if (opts.maxbatchsize == 0) { // logic to auto-set best batchsize + nbatch = 1 + (ntrans - 1) / nthr; // min # batches poss + batchSize = 1 + (ntrans - 1) / nbatch; // then cut # thr in each b + } else { // batchSize override by user + batchSize = std::min(opts.maxbatchsize, ntrans); + nbatch = 1 + (ntrans - 1) / batchSize; // resulting # batches } - if (p->opts.spread_thread == 0) p->opts.spread_thread = 2; // our auto choice - if (p->opts.spread_thread != 1 && p->opts.spread_thread != 2) { + if (opts.spread_thread == 0) opts.spread_thread = 2; // our auto choice + if (opts.spread_thread != 1 && opts.spread_thread != 2) { fprintf(stderr, "[%s] illegal opts.spread_thread!\n", __func__); - return FINUFFT_ERR_SPREAD_THREAD_NOTVALID; + throw int(FINUFFT_ERR_SPREAD_THREAD_NOTVALID); } - if (type != 3) { // read in user Fourier mode array sizes... - p->ms = n_modes[0]; - p->mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims - p->mu = (dim > 2) ? n_modes[2] : 1; - p->N = p->ms * p->mt * p->mu; // N = total # modes + if (type != 3) { // read in user Fourier mode array sizes... + ms = n_modes[0]; + mt = (dim > 1) ? n_modes[1] : 1; // leave as 1 for unused dims + mu = (dim > 2) ? n_modes[2] : 1; + N = ms * mt * mu; // N = total # modes } // heuristic to choose default upsampfac... (currently two poss) - if (p->opts.upsampfac == 0.0) { // indicates auto-choose - p->opts.upsampfac = 2.0; // default, and need for tol small - if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach - if (type == 3) // could move to setpts, more known? - p->opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT - else if ((dim == 1 && p->N > 10000000) || (dim == 2 && p->N > 300000) || - (dim == 3 && p->N > 3000000)) // type 1,2 heuristic cutoffs, double, - // typ tol, 12-core xeon - p->opts.upsampfac = 1.25; + if (opts.upsampfac == 0.0) { // indicates auto-choose + opts.upsampfac = 2.0; // default, and need for tol small + if (tol >= (TF)1E-9) { // the tol sigma=5/4 can reach + if (type == 3) // could move to setpts, more known? + opts.upsampfac = 1.25; // faster b/c smaller RAM & FFT + else if ((dim == 1 && N > 10000000) || (dim == 2 && N > 300000) || + (dim == 3 && N > 3000000)) // type 1,2 heuristic cutoffs, double, + // typ tol, 12-core xeon + opts.upsampfac = 1.25; } - if (p->opts.debug > 1) - printf("[%s] set auto upsampfac=%.2f\n", __func__, p->opts.upsampfac); + if (opts.debug > 1) + printf("[%s] set auto upsampfac=%.2f\n", __func__, opts.upsampfac); } // use opts to choose and write into plan's spread options... - int ier = setup_spreader_for_nufft(p->spopts, tol, p->opts, dim); + ier = setup_spreader_for_nufft(spopts, tol, opts, dim); if (ier > 1) // proceed if success or warning - return ier; + throw int(ier); // set others as defaults (or unallocated for arrays)... - p->X = nullptr; - p->Y = nullptr; - p->Z = nullptr; - p->nf1 = 1; - p->nf2 = 1; - p->nf3 = 1; // crucial to leave as 1 for unused dims + X = nullptr; + Y = nullptr; + Z = nullptr; + nf1 = 1; + nf2 = 1; + nf3 = 1; // crucial to leave as 1 for unused dims // ------------------------ types 1,2: planning needed --------------------- if (type == 1 || type == 2) { @@ -667,97 +655,115 @@ int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) // Note: batchSize not used since might be only 1. - p->spopts.spread_direction = type; + spopts.spread_direction = type; constexpr TF EPSILON = std::numeric_limits::epsilon(); - if (p->opts.showwarn) { // user warn round-off error... - if (EPSILON * p->ms > 1.0) + if (opts.showwarn) { // user warn round-off error... + if (EPSILON * ms > 1.0) fprintf(stderr, "%s warning: rounding err predicted eps_mach*N1 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->ms)); - if (EPSILON * p->mt > 1.0) + __func__, (double)(EPSILON * ms)); + if (EPSILON * mt > 1.0) fprintf(stderr, "%s warning: rounding err predicted eps_mach*N2 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mt)); - if (EPSILON * p->mu > 1.0) + __func__, (double)(EPSILON * mt)); + if (EPSILON * mu > 1.0) fprintf(stderr, "%s warning: rounding err predicted eps_mach*N3 = %.3g > 1 !\n", - __func__, (double)(EPSILON * p->mu)); + __func__, (double)(EPSILON * mu)); } // determine fine grid sizes, sanity check.. - int nfier = set_nf_type12(p->ms, p->opts, p->spopts, &(p->nf1)); - if (nfier) return nfier; // nf too big; we're done - p->phiHat1.resize(p->nf1 / 2 + 1); + int nfier = set_nf_type12(ms, opts, spopts, &nf1); + if (nfier) throw nfier; // nf too big; we're done + phiHat1.resize(nf1 / 2 + 1); if (dim > 1) { - nfier = set_nf_type12(p->mt, p->opts, p->spopts, &(p->nf2)); - if (nfier) return nfier; - p->phiHat2.resize(p->nf2 / 2 + 1); + nfier = set_nf_type12(mt, opts, spopts, &nf2); + if (nfier) throw nfier; + phiHat2.resize(nf2 / 2 + 1); } if (dim > 2) { - nfier = set_nf_type12(p->mu, p->opts, p->spopts, &(p->nf3)); - if (nfier) return nfier; - p->phiHat3.resize(p->nf3 / 2 + 1); + nfier = set_nf_type12(mu, opts, spopts, &nf3); + if (nfier) throw nfier; + phiHat3.resize(nf3 / 2 + 1); } - if (p->opts.debug) { // "long long" here is to avoid warnings with printf... + if (opts.debug) { // "long long" here is to avoid warnings with printf... printf("[%s] %dd%d: (ms,mt,mu)=(%lld,%lld,%lld) " "(nf1,nf2,nf3)=(%lld,%lld,%lld)\n ntrans=%d nthr=%d " "batchSize=%d ", - __func__, dim, type, (long long)p->ms, (long long)p->mt, (long long)p->mu, - (long long)p->nf1, (long long)p->nf2, (long long)p->nf3, ntrans, nthr, - p->batchSize); - if (p->batchSize == 1) // spread_thread has no effect in this case + __func__, dim, type, (long long)ms, (long long)mt, (long long)mu, + (long long)nf1, (long long)nf2, (long long)nf3, ntrans, nthr, batchSize); + if (batchSize == 1) // spread_thread has no effect in this case printf("\n"); else - printf(" spread_thread=%d\n", p->opts.spread_thread); + printf(" spread_thread=%d\n", opts.spread_thread); } // STEP 0: get Fourier coeffs of spreading kernel along each fine grid dim CNTime timer; timer.start(); - onedim_fseries_kernel(p->nf1, p->phiHat1, p->spopts); - if (dim > 1) onedim_fseries_kernel(p->nf2, p->phiHat2, p->spopts); - if (dim > 2) onedim_fseries_kernel(p->nf3, p->phiHat3, p->spopts); - if (p->opts.debug) - printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, p->spopts.nspread, + onedim_fseries_kernel(nf1, phiHat1, spopts); + if (dim > 1) onedim_fseries_kernel(nf2, phiHat2, spopts); + if (dim > 2) onedim_fseries_kernel(nf3, phiHat3, spopts); + if (opts.debug) + printf("[%s] kernel fser (ns=%d):\t\t%.3g s\n", __func__, spopts.nspread, timer.elapsedsec()); - p->nf = p->nf1 * p->nf2 * p->nf3; // fine grid total number of points - if (p->nf * p->batchSize > MAX_NF) { - fprintf(stderr, - "[%s] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", - __func__); - // FIXME: this error causes memory leaks. We should free phiHat1, phiHat2, phiHat3 - return FINUFFT_ERR_MAXNALLOC; + nf = nf1 * nf2 * nf3; // fine grid total number of points + if (nf * batchSize > MAX_NF) { + fprintf( + stderr, + "[%s] fwBatch would be bigger than MAX_NF, not attempting memory allocation!\n", + __func__); + throw int(FINUFFT_ERR_MAXNALLOC); } timer.restart(); - p->fwBatch = p->fftPlan->alloc_complex(p->nf * p->batchSize); // the big workspace - if (p->opts.debug) + fwBatch = fftPlan->alloc_complex(nf * batchSize); // the big workspace + if (opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, - (double)1E-09 * sizeof(std::complex) * p->nf * p->batchSize, + (double)1E-09 * sizeof(std::complex) * nf * batchSize, timer.elapsedsec()); - if (!p->fwBatch) { // we don't catch all such mallocs, just this big one - fprintf(stderr, "[%s] FFTW malloc failed for fwBatch (working fine grids)!\n", + if (!fwBatch) { // we don't catch all such allocs, just this big one + fprintf(stderr, "[%s] FFT allocation failed for fwBatch (working fine grids)!\n", __func__); - return FINUFFT_ERR_ALLOC; + throw int(FINUFFT_ERR_ALLOC); } timer.restart(); // plan the FFTW - const auto ns = gridsize_for_fft(p); - p->fftPlan->plan(ns, p->batchSize, p->fwBatch, p->fftSign, p->opts.fftw, nthr_fft); - if (p->opts.debug) - printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, p->opts.fftw, - nthr_fft, timer.elapsedsec()); + const auto ns = gridsize_for_fft(this); + fftPlan->plan(ns, batchSize, fwBatch, fftSign, opts.fftw, nthr_fft); + if (opts.debug) + printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft, + timer.elapsedsec()); } else { // -------------------------- type 3 (no planning) ------------ - if (p->opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); + if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); // in case destroy occurs before setpts, need safe dummy ptrs/plans... - p->fwBatch = nullptr; - p->innerT2plan = nullptr; + fwBatch = nullptr; + innerT2plan = nullptr; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! } - return ier; // report setup_spreader status (could be warning) + if (ier > 1) throw int(ier); // report setup_spreader status (could be warning) +} + +template +int finufft_makeplan_t(int type, int dim, const BIGINT *n_modes, int iflag, int ntrans, + TF tol, FINUFFT_PLAN_T **pp, finufft_opts *opts) +// Populates the fields of finufft_plan which is pointed to by "pp". +// opts is ptr to a finufft_opts to set options, or nullptr to use defaults. +// For some of the fields (if "auto" selected) here choose the actual setting. +// For types 1,2 allocates memory for internal working arrays, +// evaluates spreading kernel coefficients, and instantiates the fftw_plan +{ + *pp = nullptr; + int ier = 0; + try { + *pp = new FINUFFT_PLAN_T(type, dim, n_modes, iflag, ntrans, tol, opts, ier); + } catch (int errcode) { + return errcode; + } + return ier; } // For this function and the following ones (i.e. everything that is accessible @@ -824,7 +830,7 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF U = u; // pick x, s intervals & shifts & # fine grid pts (nf) in each dim... - TF S1, S2, S3; // get half-width X, center C, which contains {x_j}... + TF S1 = 0, S2 = 0, S3 = 0; // get half-width X, center C, which contains {x_j}... arraywidcen(nj, xj, &(t3P.X1), &(t3P.C1)); arraywidcen(nk, s, &S1, &(t3P.D1)); // same D, S, but for {s_k} set_nhg_type3(S1, t3P.X1, opts, spopts, &(nf1), &(t3P.h1), @@ -858,7 +864,8 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF nf = nf1 * nf2 * nf3; // fine grid total number of points if (nf * batchSize > MAX_NF) { fprintf(stderr, - "[%s t3] fwBatch would be bigger than MAX_NF, not attempting malloc!\n", + "[%s t3] fwBatch would be bigger than MAX_NF, not attempting memory " + "allocation!\n", __func__); return FINUFFT_ERR_MAXNALLOC; } @@ -872,24 +879,24 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF (double)1E-09 * sizeof(std::complex) * (nf + nj) * batchSize, timer.elapsedsec()); if (!fwBatch) { - fprintf(stderr, "[%s t3] malloc fail for fwBatch or CpBatch!\n", __func__); + fprintf(stderr, "[%s t3] allocation fail for fwBatch or CpBatch!\n", __func__); return FINUFFT_ERR_ALLOC; } // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... // FIXME: should use realloc - if (X) free(X); - X = (TF *)malloc(sizeof(TF) * nj); + if (X) delete[] X; + X = new TF[nj]; Sp.resize(nk); if (d > 1) { - if (Y) free(Y); - Y = (TF *)malloc(sizeof(TF) * nj); + if (Y) delete[] Y; + Y = new TF[nj]; Tp.resize(nk); } if (d > 2) { - if (Z) free(Z); - Z = (TF *)malloc(sizeof(TF) * nj); + if (Z) delete[] Z; + Z = new TF[nj]; Up.resize(nk); } @@ -920,9 +927,9 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF } } else for (BIGINT j = 0; j < nj; ++j) - prephase[j] = (std::complex)1.0; // *** or keep flag so no mult in exec?? + prephase[j] = {1, 0}; // *** or keep flag so no mult in exec?? - // rescale the target s_k etc to s'_k etc... + // rescale the target s_k etc to s'_k etc... #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1); // so |s'_k| < pi/R @@ -947,16 +954,9 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF phiHatk3.resize(nk); onedim_nuft_kernel(nk, Up, phiHatk3, spopts); // fill phiHat3 } - int Cfinite = - std::isfinite(t3P.C1) && std::isfinite(t3P.C2) && std::isfinite(t3P.C3); // C can - // be nan - // or inf - // if - // M=0, - // no - // input - // NU pts - int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen + // C can be nan or inf if M=0, no input NU pts + int Cfinite = std::isfinite(t3P.C1) && std::isfinite(t3P.C2) && std::isfinite(t3P.C3); + int Cnonzero = t3P.C1 != 0.0 || t3P.C2 != 0.0 || t3P.C3 != 0.0; // cen #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { // .... loop over NU targ freqs TF phiHat = phiHatk1[k]; @@ -990,20 +990,18 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF t2opts.spread_debug = std::max(0, opts.spread_debug - 1); t2opts.showwarn = 0; // so don't see warnings 2x // (...could vary other t2opts here?) - if (innerT2plan) { - delete innerT2plan; - innerT2plan = nullptr; - } - int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, - &innerT2plan, &t2opts); + // MR: temporary hack, until we have figured out the C++ interface. + FINUFFT_PLAN_T *tmpplan; + int ier = finufft_makeplan_t(2, d, t2nmodes, fftSign, batchSize, tol, &tmpplan, + &t2opts); + innerT2plan.reset(tmpplan); if (ier > 1) { // if merely warning, still proceed fprintf(stderr, "[%s t3]: inner type 2 plan creation failed with ier=%d!\n", __func__, ier); return ier; } - ier = finufft_setpts_t(innerT2plan, nk, Sp.data(), Tp.data(), Up.data(), 0, - nullptr, nullptr, - nullptr); // note nk = # output points (not nj) + ier = innerT2plan->setpts(nk, Sp.data(), Tp.data(), Up.data(), 0, nullptr, nullptr, + nullptr); // note nk = # output points (not nj) if (ier > 1) { fprintf(stderr, "[%s t3]: inner type 2 setpts failed, ier=%d!\n", __func__, ier); return ier; @@ -1013,23 +1011,10 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF } return 0; } -template -int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, - TF *s, TF *t, TF *u) -/* For type 1,2: just checks and (possibly) sorts the NU xyz points, in prep for - spreading. (The last 4 arguments are ignored.) - For type 3: allocates internal working arrays, scales/centers the NU points - and NU target freqs (stu), evaluates spreading kernel FT at all target freqs. -*/ -{ - return p->setpts(nj, xj, yj, zj, nk, s, t, u); -} -template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, float *xj, - float *yj, float *zj, BIGINT nk, float *s, float *t, - float *u); -template int finufft_setpts_t(FINUFFT_PLAN_T *p, BIGINT nj, double *xj, - double *yj, double *zj, BIGINT nk, double *s, - double *t, double *u); +template int FINUFFT_PLAN_T::setpts(BIGINT nj, float *xj, float *yj, float *zj, + BIGINT nk, float *s, float *t, float *u); +template int FINUFFT_PLAN_T::setpts(BIGINT nj, double *xj, double *yj, double *zj, + BIGINT nk, double *s, double *t, double *u); // ............ end setpts .................................................. @@ -1152,7 +1137,7 @@ int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { innerT2plan->ntrans = thisBatchSize; // do not try this at home! /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array still the same size, as Andrea explained; just wastes a few flops) */ - finufft_execute_t(innerT2plan, fkb, fwBatch); + innerT2plan->execute(fkb, fwBatch); t_t2 += timer.elapsedsec(); // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... timer.restart(); @@ -1176,26 +1161,10 @@ int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { return 0; } -template -int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk) { - /* See ../docs/cguru.doc for current documentation. - - For given (stack of) weights cj or coefficients fk, performs NUFFTs with - existing (sorted) NU pts and existing plan. - For type 1 and 3: cj is input, fk is output. - For type 2: fk is input, cj is output. - Performs spread/interp, pre/post deconvolve, and FFT as appropriate - for each of the 3 types. - For cases of ntrans>1, performs work in blocks of size up to batchSize. - Return value 0 (no error diagnosis yet). - Barnett 5/20/20, based on Malleo 2019. -*/ - return p->execute(cj, fk); -} -template int finufft_execute_t(FINUFFT_PLAN_T *p, std::complex *cj, - std::complex *fk); -template int finufft_execute_t( - FINUFFT_PLAN_T *p, std::complex *cj, std::complex *fk); +template int FINUFFT_PLAN_T::execute(std::complex *cj, + std::complex *fk); +template int FINUFFT_PLAN_T::execute(std::complex *cj, + std::complex *fk); // DDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDDD template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { @@ -1203,13 +1172,11 @@ template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { // Also must not crash if called immediately after finufft_makeplan. // Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. - if (fftPlan) fftPlan->free(fwBatch); // free the big FFTW (or t3 spread) working array + if (fftPlan) fftPlan->free(fwBatch); // free the big FFT (or t3 spread) working array if (type == 3) { - delete innerT2plan; - innerT2plan = nullptr; - free(X); - free(Y); - free(Z); + delete[] X; + delete[] Y; + delete[] Z; } } template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 7c7309de2..a0341a0b5 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -1,6 +1,5 @@ // Spreading/interpolating module within FINUFFT. -#include #include #include @@ -9,6 +8,7 @@ #include +#include #include #include #include @@ -164,7 +164,7 @@ constexpr std::array, N> pad_2D_array_with_zeros( const std::array, N> &input) noexcept { constexpr auto pad_with_zeros = [](const auto &input) constexpr noexcept { std::array padded{0}; - for (auto i = 0; i < input.size(); ++i) { + for (size_t i = 0; i < input.size(); ++i) { padded[i] = input[i]; } return padded; @@ -187,20 +187,21 @@ FINUFFT_NEVER_INLINE void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset3, UBIGINT padded_size1, UBIGINT size1, UBIGINT size2, UBIGINT size3, UBIGINT M0) { - printf("size1 %ld, padded_size1 %ld\n", size1, padded_size1); + printf("size1 %" PRIu64 ", padded_size1 %" PRIu64 "\n", size1, padded_size1); switch (ndims) { case 1: - printf("\tsubgrid: off %lld\t siz %lld\t #NU %lld\n", (long long)offset1, - (long long)padded_size1, (long long)M0); + printf("\tsubgrid: off %" PRId64 "\t siz %" PRIu64 "\t #NU %" PRIu64 "\n", offset1, + padded_size1, M0); break; case 2: - printf("\tsubgrid: off %lld,%lld\t siz %lld,%lld\t #NU %lld\n", (long long)offset1, - (long long)offset2, (long long)padded_size1, (long long)size2, (long long)M0); + printf("\tsubgrid: off %" PRId64 ",%" PRId64 "\t siz %" PRIu64 ",%" PRIu64 + "\t #NU %" PRIu64 "\n", + offset1, offset2, padded_size1, size2, M0); break; case 3: - printf("\tsubgrid: off %lld,%lld,%lld\t siz %lld,%lld,%lld\t #NU %lld\n", - (long long)offset1, (long long)offset2, (long long)offset3, - (long long)padded_size1, (long long)size2, (long long)size3, (long long)M0); + printf("\tsubgrid: off %" PRId64 ",%" PRId64 ",%" PRId64 "\t siz %" PRIu64 ",%" PRIu64 + ",%" PRIu64 "\t #NU %" PRIu64 "\n", + offset1, offset2, offset3, padded_size1, size2, size3, M0); break; default: printf("Invalid number of dimensions: %d\n", ndims); @@ -223,13 +224,6 @@ static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept return (result - floor(result)) * T(N); } -template -static FINUFFT_ALWAYS_INLINE simd_type fold_rescale(const simd_type &x, - const BIGINT N) noexcept { - const simd_type x2pi = T(M_1_2PI); - const simd_type result = xsimd::fma(x, x2pi, simd_type(0.5)); - return (result - xsimd::floor(result)) * simd_type(T(N)); -} template static void set_kernel_args(T *args, T x) noexcept // Fills vector args[] with kernel arguments x, x+1, ..., x+ns-1. @@ -400,7 +394,7 @@ static void interp_line_wrap(T *FINUFFT_RESTRICT target, const T *du, const T *k out[0] = std::fma(du[2 * j], ker[dx], out[0]); out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); } - } else if (i1 + ns >= N1) { // wraps at right + } else if (i1 + ns >= BIGINT(N1)) { // wraps at right for (uint8_t dx = 0; dx < N1 - i1; ++dx, ++j) { out[0] = std::fma(du[2 * j], ker[dx], out[0]); out[1] = std::fma(du[2 * j + 1], ker[dx], out[1]); @@ -446,14 +440,13 @@ static void interp_line(T *FINUFFT_RESTRICT target, const T *du, const T *ker, B */ using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr auto regular_part = (2 * ns + padding) & (-(2 * simd_size)); std::array out{0}; const auto j = i1; // removing the wrapping leads up to 10% speedup in certain cases // moved the wrapping to another function to reduce instruction cache pressure - if (i1 < 0 || i1 + ns >= N1 || i1 + ns + (padding + 1) / 2 >= N1) { + if (i1 < 0 || i1 + ns >= BIGINT(N1) || i1 + ns + (padding + 1) / 2 >= BIGINT(N1)) { return interp_line_wrap(target, du, ker, i1, N1); } else { // doesn't wrap // logic largely similar to spread 1D kernel, please see the explanation there @@ -519,7 +512,7 @@ static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T std::array out{0}; using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); - if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2) { + if (i1 >= 0 && i1 + ns <= BIGINT(N1) && i2 >= 0 && i2 + ns <= BIGINT(N2)) { // store a horiz line (interleaved real,imag) alignas(alignment) std::array line{0}; // add remaining const-y lines to the line (expensive inner loop) @@ -539,10 +532,10 @@ static void interp_square_wrap(T *FINUFFT_RESTRICT target, const T *du, const T auto x = i1, y = i2; // initialize coords for (uint8_t d{0}; d < ns; d++) { // set up ptr lists if (x < 0) x += BIGINT(N1); - if (x >= N1) x -= BIGINT(N1); + if (x >= BIGINT(N1)) x -= BIGINT(N1); j1[d] = x++; if (y < 0) y += BIGINT(N2); - if (y >= N2) y -= BIGINT(N2); + if (y >= BIGINT(N2)) y -= BIGINT(N2); j2[d] = y++; } for (uint8_t dy{0}; dy < ns; dy++) { // use the pts lists @@ -598,11 +591,10 @@ static void interp_square(T *FINUFFT_RESTRICT target, const T *du, const T *ker1 // no wrapping: avoid ptrs using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size; - if (i1 >= 0 && i1 + ns <= N1 && i2 >= 0 && i2 + ns <= N2 && - (i1 + ns + (padding + 1) / 2 < N1)) { + if (i1 >= 0 && i1 + ns <= BIGINT(N1) && i2 >= 0 && i2 + ns <= BIGINT(N2) && + (i1 + ns + (padding + 1) / 2 < BIGINT(N1))) { const auto line = [du, N1, i1 = UBIGINT(i1), i2 = UBIGINT(i2), ker2]() constexpr noexcept { // new array du_pts to store the du values for the current y line @@ -666,9 +658,9 @@ static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T */ using arch_t = typename simd_type::arch_type; static constexpr auto alignment = arch_t::alignment(); - const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); - const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); - const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); + const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= BIGINT(N1)); + const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= BIGINT(N2)); + const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= BIGINT(N3)); std::array out{0}; // case no wrapping needed but padding spills over du array. // Hence, no explicit vectorization but the code is still faster @@ -701,13 +693,13 @@ static void interp_cube_wrapped(T *FINUFFT_RESTRICT target, const T *du, const T auto x = i1, y = i2, z = i3; // initialize coords for (uint8_t d{0}; d < ns; d++) { // set up ptr lists if (x < 0) x += BIGINT(N1); - if (x >= N1) x -= BIGINT(N1); + if (x >= BIGINT(N1)) x -= BIGINT(N1); j1[d] = x++; if (y < 0) y += BIGINT(N2); - if (y >= N2) y -= BIGINT(N2); + if (y >= BIGINT(N2)) y -= BIGINT(N2); j2[d] = y++; if (z < 0) z += BIGINT(N3); - if (z >= N3) z -= BIGINT(N3); + if (z >= BIGINT(N3)) z -= BIGINT(N3); j3[d] = z++; } for (uint8_t dz{0}; dz < ns; dz++) { // use the pts lists @@ -763,15 +755,14 @@ static void interp_cube(T *FINUFFT_RESTRICT target, const T *du, const T *ker1, { using arch_t = typename simd_type::arch_type; static constexpr auto padding = get_padding(); - static constexpr auto alignment = arch_t::alignment(); static constexpr auto simd_size = simd_type::size; - static constexpr auto ker23_size = (ns + simd_size - 1) & -simd_size; static constexpr uint8_t line_vectors = (2 * ns + padding) / simd_size; - const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= N1); - const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= N2); - const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= N3); + const auto in_bounds_1 = (i1 >= 0) & (i1 + ns <= BIGINT(N1)); + const auto in_bounds_2 = (i2 >= 0) & (i2 + ns <= BIGINT(N2)); + const auto in_bounds_3 = (i3 >= 0) & (i3 + ns <= BIGINT(N3)); std::array out{0}; - if (in_bounds_1 && in_bounds_2 && in_bounds_3 && (i1 + ns + (padding + 1) / 2 < N1)) { + if (in_bounds_1 && in_bounds_2 && in_bounds_3 && + (i1 + ns + (padding + 1) / 2 < BIGINT(N1))) { const auto line = [N1, N2, i1 = UBIGINT(i1), i2 = UBIGINT(i2), i3 = UBIGINT(i3), ker2, ker3, du]() constexpr noexcept { std::array line{0}; @@ -844,7 +835,7 @@ static FINUFFT_ALWAYS_INLINE auto ker_eval( */ const std::array inputs{elems...}; // compile time loop, no performance overhead - for (auto i = 0; i < sizeof...(elems); ++i) { + for (size_t i = 0; i < sizeof...(elems); ++i) { // compile time branch no performance overhead if constexpr (kerevalmeth == 1) { if (opts.upsampfac == 2.0) { @@ -1365,27 +1356,27 @@ static void add_wrapped_subgrid(BIGINT offset1, BIGINT offset2, BIGINT offset3, }; BIGINT y = offset2, z = offset3; // fill wrapped ptr lists in slower dims y,z... - for (int i = 0; i < size2; ++i) { + for (UBIGINT i = 0; i < size2; ++i) { if (y < 0) y += BIGINT(N2); - if (y >= N2) y -= BIGINT(N2); + if (y >= BIGINT(N2)) y -= BIGINT(N2); o2[i] = y++; } - for (int i = 0; i < size3; ++i) { + for (UBIGINT i = 0; i < size3; ++i) { if (z < 0) z += BIGINT(N3); - if (z >= N3) z -= BIGINT(N3); + if (z >= BIGINT(N3)) z -= BIGINT(N3); o3[i] = z++; } UBIGINT nlo = (offset1 < 0) ? -offset1 : 0; // # wrapping below in x UBIGINT nhi = (offset1 + size1 > N1) ? offset1 + size1 - N1 : 0; // " above in x // this triple loop works in all dims - for (int dz = 0; dz < size3; dz++) { // use ptr lists in each axis + for (UBIGINT dz = 0; dz < size3; dz++) { // use ptr lists in each axis const auto oz = N1 * N2 * o3[dz]; // offset due to z (0 in <3D) - for (int dy = 0; dy < size2; dy++) { + for (UBIGINT dy = 0; dy < size2; dy++) { const auto oy = N1 * o2[dy] + oz; // off due to y & z (0 in 1D) auto *FINUFFT_RESTRICT out = data_uniform + 2 * oy; const auto in = du0 + 2 * padded_size1 * (dy + size2 * dz); // ptr to subgrid array auto o = 2 * (offset1 + N1); // 1d offset for output - for (auto j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts) + for (UBIGINT j = 0; j < 2 * nlo; j++) { // j is really dx/2 (since re,im parts) accumulate(out[j + o], in[j]); } o = 2 * offset1; @@ -1447,7 +1438,7 @@ static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T * // count how many pts in each bin std::vector counts(nbins, 0); - for (auto i = 0; i < M; i++) { + for (UBIGINT i = 0; i < M; i++) { // find the bin index in however many dims are needed const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; @@ -1464,7 +1455,7 @@ static void bin_sort_singlethread(std::vector &ret, UBIGINT M, const T * current_offset += tmp; } // (counts now contains the index offsets for each bin) - for (auto i = 0; i < M; i++) { + for (UBIGINT i = 0; i < M; i++) { // find the bin index (again! but better than using RAM) const auto i1 = BIGINT(fold_rescale(kx[i], N1) * inv_bin_size_x); const auto i2 = isky ? BIGINT(fold_rescale(ky[i], N2) * inv_bin_size_y) : 0; @@ -1726,7 +1717,7 @@ int spreadcheck(UBIGINT N1, UBIGINT N2, UBIGINT N3, UBIGINT M, T *kx, T *ky, T * */ { // INPUT CHECKING & REPORTING .... cuboid not too small for spreading? - int minN = 2 * opts.nspread; + UBIGINT minN = UBIGINT(2 * opts.nspread); if (N1 < minN || (N2 > 1 && N2 < minN) || (N3 > 1 && N3 < minN)) { fprintf(stderr, "%s error: one or more non-trivial box dims is less than 2.nspread!\n", @@ -1815,8 +1806,8 @@ int indexSort(std::vector &sort_indices, UBIGINT N1, UBIGINT N2, UBIGINT did_sort = 1; } else { #pragma omp parallel for num_threads(maxnthr) schedule(static, 1000000) - for (BIGINT i = 0; i < M; i++) // here omp helps xeon, hinders i7 - sort_indices[i] = i; // the identity permutation + for (BIGINT i = 0; i < BIGINT(M); i++) // here omp helps xeon, hinders i7 + sort_indices[i] = i; // the identity permutation if (opts.debug) printf("\tnot sorted (sort=%d): \t%.3g s\n", (int)opts.sort, timer.elapsedsec()); } @@ -1887,21 +1878,22 @@ static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBI printf("\tnthr big: switching add_wrapped OMP from critical to atomic (!)\n"); std::vector brk(nb + 1); // NU index breakpoints defining nb subproblems - for (int p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; + for (UBIGINT p = 0; p <= nb; ++p) brk[p] = (M * p + nb - 1) / nb; #pragma omp parallel num_threads(nthr) { // local copies of NU pts and data for each subproblem std::vector kx0{}, ky0{}, kz0{}, dd0{}, du0{}; -#pragma omp for schedule(dynamic, 1) // each is big - for (int isub = 0; isub < nb; isub++) { // Main loop through the subproblems - const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem +#pragma omp for schedule(dynamic, 1) // each is big + for (BIGINT isub = 0; isub < BIGINT(nb); isub++) { // Main loop through the + // subproblems + const auto M0 = brk[isub + 1] - brk[isub]; // # NU pts in this subproblem // copy the location and data vectors for the nonuniform points kx0.resize(M0); ky0.resize(M0 * (N2 > 1)); kz0.resize(M0 * (N3 > 1)); dd0.resize(2 * M0); // complex strength data - for (auto j = 0; j < M0; j++) { // todo: can avoid this copying? + for (UBIGINT j = 0; j < M0; j++) { // todo: can avoid this copying? const auto kk = sort_indices[j + brk[isub]]; // NU pt from subprob index list kx0[j] = fold_rescale(kx[kk], N1); if (N2 > 1) ky0[j] = fold_rescale(ky[kk], N2); @@ -1950,7 +1942,8 @@ static int spreadSorted(const std::vector &sort_indices, UBIGINT N1, UBI } // end main loop over subprobs } if (opts.debug) - printf("\tt1 fancy spread: \t%.3g s (%ld subprobs)\n", timer.elapsedsec(), nb); + printf("\tt1 fancy spread: \t%.3g s (%" PRIu64 " subprobs)\n", timer.elapsedsec(), + nb); } // end of choice of which t1 spread type to use return 0; }; @@ -1998,10 +1991,10 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( // main loop over NU trgs, interp each from U // (note: windows omp doesn't like unsigned loop vars) #pragma omp for schedule(dynamic, 1000) // assign threads to NU targ pts: - for (BIGINT i = 0; i < M; i += CHUNKSIZE) { + for (BIGINT i = 0; i < BIGINT(M); i += CHUNKSIZE) { // Setup buffers for this chunk const UBIGINT bufsize = (i + CHUNKSIZE > M) ? M - i : CHUNKSIZE; - for (int ibuf = 0; ibuf < bufsize; ibuf++) { + for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) { UBIGINT j = sort_indices[i + ibuf]; jlist[ibuf] = j; xjlist[ibuf] = fold_rescale(kx[j], N1); @@ -2010,7 +2003,7 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( } // Loop over targets in chunk - for (int ibuf = 0; ibuf < bufsize; ibuf++) { + for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) { const auto xj = xjlist[ibuf]; const auto yj = (ndims > 1) ? yjlist[ibuf] : 0; const auto zj = (ndims > 2) ? zjlist[ibuf] : 0; @@ -2052,7 +2045,7 @@ FINUFFT_NEVER_INLINE static int interpSorted_kernel( } // end loop over targets in chunk // Copy result buffer to output array - for (int ibuf = 0; ibuf < bufsize; ibuf++) { + for (UBIGINT ibuf = 0; ibuf < bufsize; ibuf++) { const UBIGINT j = jlist[ibuf]; data_nonuniform[2 * j] = outbuf[2 * ibuf]; data_nonuniform[2 * j + 1] = outbuf[2 * ibuf + 1]; @@ -2154,8 +2147,9 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps opts. dim is spatial dimension (1,2, or 3). See finufft.cpp:finufft_plan() for where upsampfac is set. Must call this before any kernel evals done, otherwise segfault likely. Returns: 0 : success FINUFFT_WARN_EPS_TOO_SMALL : requested eps cannot be - achieved, but proceed with best possible eps otherwise : failure (see codes in defs.h); - spreading must not proceed Barnett 2017. debug, loosened eps logic 6/14/20. + achieved, but proceed with best possible eps otherwise : failure (see codes in + finufft_errors.h); spreading must not proceed Barnett 2017. debug, loosened eps logic + 6/14/20. */ { constexpr T EPSILON = std::numeric_limits::epsilon(); @@ -2206,7 +2200,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps if (upsampfac == 2.0) // standard sigma (see SISC paper) ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10 else // custom sigma - ns = std::ceil(-log(eps) / (PI * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 + ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 ns = max(2, ns); // (we don't have ns=1 version yet) if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) @@ -2228,7 +2222,8 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma T gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! - betaoverns = gamma * PI * (1.0 - 1.0 / (2 * upsampfac)); // formula based on cutoff + betaoverns = gamma * T(M_PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based on + // cutoff } opts.ES_beta = betaoverns * ns; // set the kernel beta parameter if (debug) diff --git a/src/utils.cpp b/src/utils.cpp index f64009132..2627a179a 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -73,7 +73,7 @@ int get_num_threads_parallel_block() } // ---------- thread-safe rand number generator for Windows platform --------- -// (note this is used by macros in defs.h, and supplied in linux/macosx) +// (note this is used by macros in test_defs.h, and supplied in linux/macosx) #ifdef _WIN32 int rand_r(unsigned int * /*seedp*/) // Libin Lu, 6/18/20 diff --git a/test/directft/dirft1d.cpp b/test/directft/dirft1d.cpp index 5f36d76d7..c80299b47 100644 --- a/test/directft/dirft1d.cpp +++ b/test/directft/dirft1d.cpp @@ -1,11 +1,13 @@ -#include #include +#include #include // This is basically a port of dirft1d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft1d1(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f) +template +void dirft1d1(BIGINT nj, T *x, std::complex *c, int iflag, BIGINT ms, + std::complex *f) /* Direct computation of 1D type-1 nonuniform FFT. Interface same as finufft1d1. c nj-1 c f[k1] = SUM c[j] exp(+-i k1 x[j]) @@ -17,12 +19,14 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/25/17 */ { - BIGINT kmin = -(ms / 2); // integer divide - for (BIGINT m = 0; m < ms; ++m) f[m] = CPX(0, 0); // it knows f is complex type + BIGINT kmin = -(ms / 2); // integer divide + for (BIGINT m = 0; m < ms; ++m) + f[m] = std::complex(0, 0); // it knows f is complex type for (BIGINT j = 0; j < nj; ++j) { - CPX a = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq - CPX cc = c[j]; // no 1/nj prefac + std::complex a = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex p = pow(a, (T)kmin); // starting phase for most neg freq + std::complex cc = c[j]; // no 1/nj prefac for (BIGINT m = 0; m < ms; ++m) { f[m] += cc * p; p *= a; @@ -30,7 +34,9 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft1d2(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT ms, CPX *f) +template +void dirft1d2(BIGINT nj, T *x, std::complex *c, int iflag, BIGINT ms, + std::complex *f) /* Direct computation of 1D type-2 nonuniform FFT. Interface same as finufft1d2 c c c[j] = SUM f[k1] exp(+-i k1 x[j]) @@ -44,9 +50,10 @@ c used, otherwise the - sign is used, in the exponential. { BIGINT kmin = -(ms / 2); // integer divide for (BIGINT j = 0; j < nj; ++j) { - CPX a = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX p = pow(a, (FLT)kmin); // starting phase for most neg freq - CPX cc = CPX(0, 0); + std::complex a = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex p = pow(a, (T)kmin); // starting phase for most neg freq + std::complex cc = std::complex(0, 0); for (BIGINT m = 0; m < ms; ++m) { cc += f[m] * p; p *= a; @@ -55,7 +62,9 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft1d3(BIGINT nj, FLT *x, CPX *c, int iflag, BIGINT nk, FLT *s, CPX *f) +template +void dirft1d3(BIGINT nj, T *x, std::complex *c, int iflag, BIGINT nk, T *s, + std::complex *f) /* Direct computation of 1D type-3 nonuniform FFT. Interface same as finufft1d3 c nj-1 c f[k] = SUM c[j] exp(+-i s[k] x[j]) @@ -66,8 +75,9 @@ c exponential. Uses C++ complex type. Simple brute force. Barnett 1/25/17 */ { for (BIGINT k = 0; k < nk; ++k) { - CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k]; - f[k] = CPX(0, 0); + std::complex ss = + (iflag > 0) ? std::complex(0, 1) * s[k] : -std::complex(0, 1) * s[k]; + f[k] = std::complex(0, 0); for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j]); } } diff --git a/test/directft/dirft2d.cpp b/test/directft/dirft2d.cpp index c13661549..62f126c15 100644 --- a/test/directft/dirft2d.cpp +++ b/test/directft/dirft2d.cpp @@ -1,11 +1,13 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft2d1(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) +template +void dirft2d1(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT ms, BIGINT mt, + std::complex *f) /* Direct computation of 2D type-1 nonuniform FFT. Interface same as finufft2d1. c nj-1 c f[k1,k2] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j])) @@ -18,19 +20,22 @@ c used, otherwise the - sign is used, in the exponential. * Uses C++ complex type and winding trick. Barnett 1/26/17 */ { - BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide - BIGINT N = ms * mt; // total # output modes - for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type - for (BIGINT j = 0; j < nj; ++j) { // src pts - CPX a1 = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); - CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq - CPX p2 = pow(a2, (FLT)k2min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m = 0; // output pointer + BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide + BIGINT N = ms * mt; // total # output modes + for (BIGINT m = 0; m < N; ++m) + f[m] = std::complex(0, 0); // it knows f is complex type + for (BIGINT j = 0; j < nj; ++j) { // src pts + std::complex a1 = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex a2 = (iflag > 0) ? exp(std::complex(0, 1) * y[j]) + : exp(-std::complex(0, 1) * y[j]); + std::complex sp1 = pow(a1, (T)k1min); // starting phase for most neg k1 freq + std::complex p2 = pow(a2, (T)k2min); + std::complex cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer for (BIGINT m2 = 0; m2 < mt; ++m2) { - CPX p1 = sp1; // must reset p1 for each inner loop - for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow + std::complex p1 = sp1; // must reset p1 for each inner loop + for (BIGINT m1 = 0; m1 < ms; ++m1) { // ms is fast, mt slow f[m++] += cc * p1 * p2; p1 *= a1; } @@ -39,7 +44,9 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt, CPX *f) +template +void dirft2d2(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT ms, BIGINT mt, + std::complex *f) /* Direct computation of 2D type-2 nonuniform FFT. Interface same as finufft2d2 c[j] = SUM f[k1,k2] exp(+-i (k1 x[j] + k2 y[j])) @@ -56,14 +63,16 @@ void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt { BIGINT k1min = -(ms / 2), k2min = -(mt / 2); // integer divide for (BIGINT j = 0; j < nj; ++j) { - CPX a1 = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); - CPX sp1 = pow(a1, (FLT)k1min); - CPX p2 = pow(a2, (FLT)k2min); - CPX cc = CPX(0, 0); - BIGINT m = 0; // input pointer + std::complex a1 = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex a2 = (iflag > 0) ? exp(std::complex(0, 1) * y[j]) + : exp(-std::complex(0, 1) * y[j]); + std::complex sp1 = pow(a1, (T)k1min); + std::complex p2 = pow(a2, (T)k2min); + std::complex cc = std::complex(0, 0); + BIGINT m = 0; // input pointer for (BIGINT m2 = 0; m2 < mt; ++m2) { - CPX p1 = sp1; + std::complex p1 = sp1; for (BIGINT m1 = 0; m1 < ms; ++m1) { cc += f[m++] * p1 * p2; p1 *= a1; @@ -74,8 +83,9 @@ void dirft2d2(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT ms, BIGINT mt } } -void dirft2d3(BIGINT nj, FLT *x, FLT *y, CPX *c, int iflag, BIGINT nk, FLT *s, FLT *t, - CPX *f) +template +void dirft2d3(BIGINT nj, T *x, T *y, std::complex *c, int iflag, BIGINT nk, T *s, T *t, + std::complex *f) /* Direct computation of 2D type-3 nonuniform FFT. Interface same as finufft2d3 c nj-1 c f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j])) @@ -86,9 +96,11 @@ c exponential. Uses C++ complex type. Simple brute force. Barnett 1/26/17 */ { for (BIGINT k = 0; k < nk; ++k) { - CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k]; - CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; - f[k] = CPX(0, 0); + std::complex ss = + (iflag > 0) ? std::complex(0, 1) * s[k] : -std::complex(0, 1) * s[k]; + std::complex tt = + (iflag > 0) ? std::complex(0, 1) * t[k] : -std::complex(0, 1) * t[k]; + f[k] = std::complex(0, 0); for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j]); } } diff --git a/test/directft/dirft3d.cpp b/test/directft/dirft3d.cpp index 452b62471..b77111257 100644 --- a/test/directft/dirft3d.cpp +++ b/test/directft/dirft3d.cpp @@ -1,12 +1,13 @@ -#include #include +#include #include // This is basically a port of dirft2d.f from CMCL package, except with // the 1/nj prefactors for type-1 removed. -void dirft3d1(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, - BIGINT mu, CPX *f) +template +void dirft3d1(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT ms, + BIGINT mt, BIGINT mu, std::complex *f) /* Direct computation of 3D type-1 nonuniform FFT. Interface same as finufft3d1. c nj-1 c f[k1,k2,k3] = SUM c[j] exp(+-i (k1 x[j] + k2 y[j] + k2 z[j])) @@ -22,20 +23,24 @@ c used, otherwise the - sign is used, in the exponential. { BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide BIGINT N = ms * mt * mu; // total # output modes - for (BIGINT m = 0; m < N; ++m) f[m] = CPX(0, 0); // it knows f is complex type - for (BIGINT j = 0; j < nj; ++j) { // src pts - CPX a1 = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); - CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); - CPX sp1 = pow(a1, (FLT)k1min); // starting phase for most neg k1 freq - CPX sp2 = pow(a2, (FLT)k2min); - CPX p3 = pow(a3, (FLT)k3min); - CPX cc = c[j]; // no 1/nj norm - BIGINT m = 0; // output pointer + for (BIGINT m = 0; m < N; ++m) + f[m] = std::complex(0, 0); // it knows f is complex type + for (BIGINT j = 0; j < nj; ++j) { // src pts + std::complex a1 = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex a2 = (iflag > 0) ? exp(std::complex(0, 1) * y[j]) + : exp(-std::complex(0, 1) * y[j]); + std::complex a3 = (iflag > 0) ? exp(std::complex(0, 1) * z[j]) + : exp(-std::complex(0, 1) * z[j]); + std::complex sp1 = pow(a1, (T)k1min); // starting phase for most neg k1 freq + std::complex sp2 = pow(a2, (T)k2min); + std::complex p3 = pow(a3, (T)k3min); + std::complex cc = c[j]; // no 1/nj norm + BIGINT m = 0; // output pointer for (BIGINT m3 = 0; m3 < mu; ++m3) { - CPX p2 = sp2; + std::complex p2 = sp2; for (BIGINT m2 = 0; m2 < mt; ++m2) { - CPX p1 = sp1; + std::complex p1 = sp1; for (BIGINT m1 = 0; m1 < ms; ++m1) { f[m++] += cc * p1 * p2 * p3; p1 *= a1; @@ -47,8 +52,9 @@ c used, otherwise the - sign is used, in the exponential. } } -void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, BIGINT mt, - BIGINT mu, CPX *f) +template +void dirft3d2(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT ms, + BIGINT mt, BIGINT mu, std::complex *f) /* Direct computation of 3D type-2 nonuniform FFT. Interface same as finufft3d2 c[j] = SUM f[k1,k2,k3] exp(+-i (k1 x[j] + k2 y[j] + k3 z[j])) @@ -66,18 +72,21 @@ void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, B { BIGINT k1min = -(ms / 2), k2min = -(mt / 2), k3min = -(mu / 2); // integer divide for (BIGINT j = 0; j < nj; ++j) { - CPX a1 = (iflag > 0) ? exp(IMA * x[j]) : exp(-IMA * x[j]); - CPX a2 = (iflag > 0) ? exp(IMA * y[j]) : exp(-IMA * y[j]); - CPX a3 = (iflag > 0) ? exp(IMA * z[j]) : exp(-IMA * z[j]); - CPX sp1 = pow(a1, (FLT)k1min); - CPX sp2 = pow(a2, (FLT)k2min); - CPX p3 = pow(a3, (FLT)k3min); - CPX cc = CPX(0, 0); - BIGINT m = 0; // input pointer + std::complex a1 = (iflag > 0) ? exp(std::complex(0, 1) * x[j]) + : exp(-std::complex(0, 1) * x[j]); + std::complex a2 = (iflag > 0) ? exp(std::complex(0, 1) * y[j]) + : exp(-std::complex(0, 1) * y[j]); + std::complex a3 = (iflag > 0) ? exp(std::complex(0, 1) * z[j]) + : exp(-std::complex(0, 1) * z[j]); + std::complex sp1 = pow(a1, (T)k1min); + std::complex sp2 = pow(a2, (T)k2min); + std::complex p3 = pow(a3, (T)k3min); + std::complex cc = std::complex(0, 0); + BIGINT m = 0; // input pointer for (BIGINT m3 = 0; m3 < mu; ++m3) { - CPX p2 = sp2; + std::complex p2 = sp2; for (BIGINT m2 = 0; m2 < mt; ++m2) { - CPX p1 = sp1; + std::complex p1 = sp1; for (BIGINT m1 = 0; m1 < ms; ++m1) { cc += f[m++] * p1 * p2 * p3; p1 *= a1; @@ -90,8 +99,9 @@ void dirft3d2(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT ms, B } } -void dirft3d3(BIGINT nj, FLT *x, FLT *y, FLT *z, CPX *c, int iflag, BIGINT nk, FLT *s, - FLT *t, FLT *u, CPX *f) +template +void dirft3d3(BIGINT nj, T *x, T *y, T *z, std::complex *c, int iflag, BIGINT nk, T *s, + T *t, T *u, std::complex *f) /* Direct computation of 3D type-3 nonuniform FFT. Interface same as finufft3d3 c nj-1 c f[k] = SUM c[j] exp(+-i (s[k] x[j] + t[k] y[j] + u[k] z[j])) @@ -102,10 +112,13 @@ c exponential. Uses C++ complex type. Simple brute force. Barnett 2/1/17 */ { for (BIGINT k = 0; k < nk; ++k) { - CPX ss = (iflag > 0) ? IMA * s[k] : -IMA * s[k]; - CPX tt = (iflag > 0) ? IMA * t[k] : -IMA * t[k]; - CPX uu = (iflag > 0) ? IMA * u[k] : -IMA * u[k]; - f[k] = CPX(0, 0); + std::complex ss = + (iflag > 0) ? std::complex(0, 1) * s[k] : -std::complex(0, 1) * s[k]; + std::complex tt = + (iflag > 0) ? std::complex(0, 1) * t[k] : -std::complex(0, 1) * t[k]; + std::complex uu = + (iflag > 0) ? std::complex(0, 1) * u[k] : -std::complex(0, 1) * u[k]; + f[k] = std::complex(0, 0); for (BIGINT j = 0; j < nj; ++j) f[k] += c[j] * exp(ss * x[j] + tt * y[j] + uu * z[j]); } } diff --git a/test/dumbinputs.cpp b/test/dumbinputs.cpp index 5e74d6d30..e182cfe5f 100644 --- a/test/dumbinputs.cpp +++ b/test/dumbinputs.cpp @@ -103,7 +103,7 @@ int main(int argc, char *argv[]) { printf("1d1 M<0:\twrong err code %d\n", ier); return 1; } - int64_t Mhuge = (int64_t)(1e16); // cf defs.h MAX_NU_PTS + int64_t Mhuge = (int64_t)(1e16); // cf finufft_core.h MAX_NU_PTS ier = FINUFFT1D1(Mhuge, x, c, +1, acc, 0, F, &opts); if (ier != FINUFFT_ERR_NUM_NU_PTS_INVALID) { printf("1d1 M huge:\twrong err code %d\n", ier); From 34b428de10e66a7f4267d9a2f6d78b0db25c0ae3 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 22 Oct 2024 22:37:18 +0200 Subject: [PATCH 02/11] finufft.cpp -> finufft_core.cpp --- CMakeLists.txt | 4 ++-- makefile | 2 +- src/{finufft.cpp => finufft_core.cpp} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename src/{finufft.cpp => finufft_core.cpp} (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ea8b76c7..ecee27aee 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,7 +257,7 @@ if(FINUFFT_USE_CPU) src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp - src/finufft.cpp + src/finufft_core.cpp src/c_interface.cpp fortran/finufftfort.cpp) else() @@ -267,7 +267,7 @@ if(FINUFFT_USE_CPU) src/utils.cpp contrib/legendre_rule_fast.cpp src/fft.cpp - src/finufft.cpp + src/finufft_core.cpp src/c_interface.cpp fortran/finufftfort.cpp) endif() diff --git a/makefile b/makefile index 7a715a7d5..90e2db250 100644 --- a/makefile +++ b/makefile @@ -137,7 +137,7 @@ ABSDYNLIB = $(FINUFFT)$(DYNLIB) SOBJS = src/utils.o src/spreadinterp.o # all lib dual-precision objs (note DUCC_OBJS empty if unused) -OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS) +OBJS = $(SOBJS) contrib/legendre_rule_fast.o src/fft.o src/finufft_core.o src/c_interface.o fortran/finufftfort.o $(DUCC_OBJS) .PHONY: usage lib examples test perftest spreadtest spreadtestall fortran matlab octave all mex python clean objclean pyclean mexclean wheel docker-wheel gurutime docs setup setupclean diff --git a/src/finufft.cpp b/src/finufft_core.cpp similarity index 100% rename from src/finufft.cpp rename to src/finufft_core.cpp From 91206c4c08094194a35f4892bf1fd388b0730b02 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Wed, 23 Oct 2024 12:41:01 +0200 Subject: [PATCH 03/11] avoid more explicit memory management --- include/finufft/finufft_core.h | 23 +++++++++++++---------- src/finufft_core.cpp | 30 +++++++++--------------------- 2 files changed, 22 insertions(+), 31 deletions(-) diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index cdfa4ab33..8f473dc86 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -171,10 +171,10 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ BIGINT mu; // number of modes in z (3) direction = N3 BIGINT N; // total # modes (prod of above three) - BIGINT nf1; // size of internal fine grid in x (1) direction - BIGINT nf2; // " y (2) - BIGINT nf3; // " z (3) - BIGINT nf; // total # fine grid points (product of the above three) + BIGINT nf1 = 1; // size of internal fine grid in x (1) direction + BIGINT nf2 = 1; // " y (2) + BIGINT nf3 = 1; // " z (3) + BIGINT nf = 1; // total # fine grid points (product of the above three) int fftSign; // sign in exponential for NUFFT defn, guaranteed to be +-1 @@ -182,15 +182,18 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ std::vector phiHat2; // " y-axis. std::vector phiHat3; // " z-axis. - TC *fwBatch = nullptr; // (batches of) fine grid(s) for FFTW to plan - // & act on. Usually the largest working array + TC *fwBatch = nullptr; // (batches of) fine grid(s) for the FFT to plan + // & act on. Usually the largest working array. + // Unfortunately this can't be a vector, since + // in the presence of FFTW it must be allocated + // by fftw_malloc. std::vector sortIndices; // precomputed NU pt permutation, speeds spread/interp bool didSort; // whether binsorting used (false: identity perm used) TF *X = nullptr, *Y = nullptr, *Z = nullptr; // for t1,2: ptr to user-supplied NU pts - // (no new allocs). for t3: allocated as - // "primed" (scaled) src pts x'_j, etc + // (no new allocs). for t3: points to + // "primed" (scaled) Xp, Yp, Zp // type 3 specific TF *S = nullptr, *T = nullptr, *U = nullptr; // pointers to user's target NU pts arrays @@ -198,8 +201,8 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ std::vector prephase; // pre-phase, for all input NU pts std::vector deconv; // reciprocal of kernel FT, phase, all output NU pts std::vector CpBatch; // working array of prephased strengths - std::vector Sp, Tp, Up; // internal primed targs (s'_k, etc), - // allocated + std::vector Xp, Yp, Zp; // internal primed NU points (x'_j, etc) + std::vector Sp, Tp, Up; // internal primed targs (s'_k, etc) type3params t3P; // groups together type 3 shift, scale, phase, parameters std::unique_ptr> innerT2plan; // ptr used for type 2 in step 2 of // type 3 diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 2130cfd8a..464ac5446 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -641,18 +641,10 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i if (ier > 1) // proceed if success or warning throw int(ier); - // set others as defaults (or unallocated for arrays)... - X = nullptr; - Y = nullptr; - Z = nullptr; - nf1 = 1; - nf2 = 1; - nf3 = 1; // crucial to leave as 1 for unused dims - // ------------------------ types 1,2: planning needed --------------------- if (type == 1 || type == 2) { - int nthr_fft = nthr; // give FFTW all threads (or use o.spread_thread?) + int nthr_fft = nthr; // give FFT all threads (or use o.spread_thread?) // Note: batchSize not used since might be only 1. spopts.spread_direction = type; @@ -885,18 +877,19 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... - // FIXME: should use realloc - if (X) delete[] X; - X = new TF[nj]; + // We do this by resizing Xp, Yp, and Zp, and pointing X, Y, Z to their data; + // this avoids any need for explicit cleanup. + Xp.resize(nj); + X = Xp.data(); Sp.resize(nk); if (d > 1) { - if (Y) delete[] Y; - Y = new TF[nj]; + Yp.resize(nj); + Y = Yp.data(); Tp.resize(nk); } if (d > 2) { - if (Z) delete[] Z; - Z = new TF[nj]; + Zp.resize(nj); + Z = Zp.data(); Up.resize(nk); } @@ -1173,11 +1166,6 @@ template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { // Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. if (fftPlan) fftPlan->free(fwBatch); // free the big FFT (or t3 spread) working array - if (type == 3) { - delete[] X; - delete[] Y; - delete[] Z; - } } template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); From cebfb73ea02b18b99d9ba1fd2ef9fba91872eb7b Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Sun, 27 Oct 2024 19:38:53 +0100 Subject: [PATCH 04/11] make PI constexpr; switch from FFTW allocation to aligned C++ allocators --- include/finufft/fft.h | 16 ---- include/finufft/finufft_core.h | 23 ++--- include/finufft/test_defs.h | 3 - perftest/guru_timing_test.cpp | 6 +- perftest/manysmallprobs.cpp | 8 +- perftest/perftest.cpp | 12 +-- perftest/results/nuffttestnd_results.txt | 103 +++++++++++++++++++++ perftest/results/nuffttestndf_results.txt | 103 +++++++++++++++++++++ perftest/results/spreadtestnd_results.txt | 97 +++++++++++++++++++ perftest/results/spreadtestndf_results.txt | 97 +++++++++++++++++++ perftest/spreadtestnd.cpp | 12 +-- perftest/spreadtestndall.cpp | 16 ++-- src/fft.cpp | 2 +- src/finufft_core.cpp | 36 +++---- src/spreadinterp.cpp | 10 +- test/basicpassfail.cpp | 10 +- test/finufft1dmany_test.cpp | 6 +- test/finufft2d_test.cpp | 8 +- test/finufft2dmany_test.cpp | 8 +- test/finufft3d_test.cpp | 12 +-- test/finufft3dkernel_test.cpp | 12 +-- test/finufft3dmany_test.cpp | 12 +-- 22 files changed, 486 insertions(+), 126 deletions(-) create mode 100644 perftest/results/nuffttestnd_results.txt create mode 100644 perftest/results/nuffttestndf_results.txt create mode 100644 perftest/results/spreadtestnd_results.txt create mode 100644 perftest/results/spreadtestndf_results.txt diff --git a/include/finufft/fft.h b/include/finufft/fft.h index 1fd8d1635..4d1a0d9c2 100644 --- a/include/finufft/fft.h +++ b/include/finufft/fft.h @@ -16,10 +16,6 @@ template class Finufft_FFT_plan { [[maybe_unused]] void plan(const std::vector & /*dims*/, size_t /*batchSize*/, std::complex * /*ptr*/, int /*sign*/, int /*options*/, int /*nthreads*/) {} - [[maybe_unused]] static std::complex *alloc_complex(size_t N) { - return new std::complex[N]; - } - [[maybe_unused]] static void free(std::complex *ptr) { delete[] ptr; } [[maybe_unused]] static void forget_wisdom() {} [[maybe_unused]] static void cleanup() {} @@ -91,12 +87,6 @@ template<> struct Finufft_FFT_plan { 1, int(nf), sign, unsigned(options)); unlock(); } - static std::complex *alloc_complex [[maybe_unused]] (size_t N) { - return reinterpret_cast *>(fftwf_alloc_complex(N)); - } - static void free [[maybe_unused]] (std::complex *ptr) { - if (ptr) fftwf_free(reinterpret_cast(ptr)); - } void execute [[maybe_unused]] () { fftwf_execute(plan_); } static void forget_wisdom [[maybe_unused]] () { fftwf_forget_wisdom(); } @@ -161,12 +151,6 @@ template<> struct Finufft_FFT_plan { sign, unsigned(options)); unlock(); } - static std::complex *alloc_complex [[maybe_unused]] (size_t N) { - return reinterpret_cast *>(fftw_alloc_complex(N)); - } - static void free [[maybe_unused]] (std::complex *ptr) { - fftw_free(reinterpret_cast(ptr)); - } void execute [[maybe_unused]] () { fftw_execute(plan_); } static void forget_wisdom [[maybe_unused]] () { fftw_forget_wisdom(); } diff --git a/include/finufft/finufft_core.h b/include/finufft/finufft_core.h index 8f473dc86..8832a654c 100644 --- a/include/finufft/finufft_core.h +++ b/include/finufft/finufft_core.h @@ -61,6 +61,7 @@ #include #include +#include // All indexing in library that potentially can exceed 2^31 uses 64-bit signed. // This includes all calling arguments (eg M,N) that could be huge someday. @@ -95,16 +96,9 @@ inline constexpr BIGINT MAX_NF = BIGINT(1e12); // values for M = nj (also nk in type 3)... inline constexpr BIGINT MAX_NU_PTS = BIGINT(1e14); -// MR: In the longer term I suggest to move -// away from M_PI, which was never part of the standard. -// Perhaps a constexpr pi in the namespace finufft, or a constexpr finufft_pi -// if no namespaces are used? -// In C++20 these constants will be part of the language, and the problem will go away. -#ifndef M_PI // Windows apparently doesn't have this const -#define M_PI 3.14159265358979329 -#endif -#define M_1_2PI 0.159154943091895336 -#define M_2PI 6.28318530717958648 +// We define our own PI here because M_PI is not actually part of standard C++ +inline constexpr double PI = 3.14159265358979329; +inline constexpr double INV_2PI = 0.159154943091895336; // ----- OpenMP macros which also work when omp not present ----- // Allows compile-time switch off of openmp, so compilation without any openmp @@ -182,11 +176,10 @@ template struct FINUFFT_PLAN_T { // the main plan object, fully C++ std::vector phiHat2; // " y-axis. std::vector phiHat3; // " z-axis. - TC *fwBatch = nullptr; // (batches of) fine grid(s) for the FFT to plan - // & act on. Usually the largest working array. - // Unfortunately this can't be a vector, since - // in the presence of FFTW it must be allocated - // by fftw_malloc. + std::vector> fwBatch; // (batches of) fine grid(s) + // for the FFT to plan & act + // on. Usually the largest + // working array. std::vector sortIndices; // precomputed NU pt permutation, speeds spread/interp bool didSort; // whether binsorting used (false: identity perm used) diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h index e8e80e9f8..556315242 100644 --- a/include/finufft/test_defs.h +++ b/include/finufft/test_defs.h @@ -40,9 +40,6 @@ using CPX = std::complex; // either-precision unit imaginary number... #define IMA (CPX(0.0, 1.0)) -// to avoid mixed precision operators in eg i*pi, an either-prec PI... -#define PI FLT(M_PI) - // Random numbers: crappy unif random number generator in [0,1). // These macros should probably be replaced by modern C++ std lib or random123. // (RAND_MAX is in stdlib.h) diff --git a/perftest/guru_timing_test.cpp b/perftest/guru_timing_test.cpp index b5030a852..97b8da053 100644 --- a/perftest/guru_timing_test.cpp +++ b/perftest/guru_timing_test.cpp @@ -135,9 +135,9 @@ int main(int argc, char *argv[]) unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(dynamic, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - if (y) y[j] = M_PI * randm11r(&se); - if (z) z[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + if (y) y[j] = PI * randm11r(&se); + if (z) z[j] = PI * randm11r(&se); } #pragma omp for schedule(dynamic, TEST_RANDCHUNK) for (BIGINT i = 0; i < ntransf * M; i++) // random strengths diff --git a/perftest/manysmallprobs.cpp b/perftest/manysmallprobs.cpp index ded866ba0..f0e4c29ae 100644 --- a/perftest/manysmallprobs.cpp +++ b/perftest/manysmallprobs.cpp @@ -40,7 +40,7 @@ int main(int argc, char *argv[]) double *x = (double *)malloc(sizeof(double) * M); complex *c = (complex *)malloc(sizeof(complex) * M); for (int j = 0; j < M; ++j) { - x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi] + x[j] = PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi] c[j] = 2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1); } @@ -52,8 +52,8 @@ int main(int argc, char *argv[]) timer.start(); for (int r = 0; r < reps; ++r) { // call the NUFFT (with iflag=+1): // printf("rep %d\n",r); - x[0] = M_PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around - c[0] = (1.0 + I) * (double)r / (double)reps; // one coeff also jiggles + x[0] = PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around + c[0] = (1.0 + I) * (double)r / (double)reps; // one coeff also jiggles ier = finufft1d1(M, x, c, +1, acc, N, F, NULL); } // (note this can't use the many-vectors interface since the NU change) @@ -72,7 +72,7 @@ int main(int argc, char *argv[]) int ntransf = 1; // since we do one at a time (neq reps) finufft_makeplan(1, 1, Ns, +1, ntransf, acc, &plan, &opts); for (int r = 0; r < reps; ++r) { // set the pts and execute - x[0] = M_PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around + x[0] = PI * (-1.0 + 2 * (double)r / (double)reps); // one source jiggles around // (of course if most sources *were* in fact fixed, use ZGEMM for them!) finufft_setpts(plan, M, x, NULL, NULL, 0, NULL, NULL, NULL); c[0] = (1.0 + I) * (double)r / (double)reps; // one coeff also jiggles diff --git a/perftest/perftest.cpp b/perftest/perftest.cpp index 3e933641f..041137bed 100644 --- a/perftest/perftest.cpp +++ b/perftest/perftest.cpp @@ -190,9 +190,9 @@ template void run_test(test_options_t &test_opts) { // Making data for (int64_t i = 0; i < M; i++) { - x[i] = M_PI * randm11(); // x in [-pi,pi) - y[i] = M_PI * randm11(); - z[i] = M_PI * randm11(); + x[i] = PI * randm11(); // x in [-pi,pi) + y[i] = PI * randm11(); + z[i] = PI * randm11(); } for (int64_t i = M; i < M * ntransf; ++i) { int64_t j = i % M; @@ -218,9 +218,9 @@ template void run_test(test_options_t &test_opts) { c[i].imag(randm11()); } for (int i = 0; i < N * ntransf; i++) { - s[i] = M_PI * randm11() * test_opts.bandwidth; - t[i] = M_PI * randm11() * test_opts.bandwidth; - u[i] = M_PI * randm11() * test_opts.bandwidth; + s[i] = PI * randm11() * test_opts.bandwidth; + t[i] = PI * randm11() * test_opts.bandwidth; + u[i] = PI * randm11() * test_opts.bandwidth; } } else { diff --git a/perftest/results/nuffttestnd_results.txt b/perftest/results/nuffttestnd_results.txt new file mode 100644 index 000000000..4b69b413a --- /dev/null +++ b/perftest/results/nuffttestnd_results.txt @@ -0,0 +1,103 @@ +nuffttestnd output: +what CPUs do I have?... +(I'm in a linux OS) +model name : AMD Ryzen 7 4800H with Radeon Graphics +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Address sizes: 48 bits physical, 48 bits virtual +Byte Order: Little Endian +CPU(s): 16 +On-line CPU(s) list: 0-15 +Vendor ID: AuthenticAMD +Model name: AMD Ryzen 7 4800H with Radeon Graphics +CPU family: 23 +Model: 96 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +Stepping: 1 +Frequency boost: enabled +CPU(s) scaling MHz: 97% +CPU max MHz: 2900,0000 +CPU min MHz: 1400,0000 +BogoMIPS: 5789,17 +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca +Virtualization: AMD-V +L1d cache: 256 KiB (8 instances) +L1i cache: 256 KiB (8 instances) +L2 cache: 4 MiB (8 instances) +L3 cache: 8 MiB (2 instances) +NUMA node(s): 1 +NUMA node0 CPU(s): 0-15 +Vulnerability Gather data sampling: Not affected +Vulnerability Itlb multihit: Not affected +Vulnerability L1tf: Not affected +Vulnerability Mds: Not affected +Vulnerability Meltdown: Not affected +Vulnerability Mmio stale data: Not affected +Vulnerability Reg file data sampling: Not affected +Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection +Vulnerability Spec rstack overflow: Mitigation; Safe RET +Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl +Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +Vulnerability Srbds: Not affected +Vulnerability Tsx async abort: Not affected + +double-precision 8-thread tests: size = 1e6, tol = 1e-6... +test 1d type 1: + 1000000 NU pts to 1000000 modes in 0.053 s 1.89e+07 NU pts/s + one mode: rel err in F[370000] is 6.59e-08 +test 1d type 2: + 1000000 modes to 1000000 NU pts in 0.037 s 2.7e+07 NU pts/s + one targ: rel err in c[500000] is 1.87e-07 +test 1d type 3: + 1000000 NU to 1000000 NU in 0.183 s 1.09e+07 tot NU pts/s + one targ: rel err in F[500000] is 5.93e-07 +test 2d type 1: + 1000000 NU pts to (500,2000) modes in 0.0616 s 1.62e+07 NU pts/s + one mode: rel err in F[185,520] is 2.92e-09 +test 2d type 2: + (500,2000) modes to 1000000 NU pts in 0.0426 s 2.35e+07 NU pts/s + one targ: rel err in c[500000] is 6.2e-07 +test 2d type 3: + 1000000 NU to 1000000 NU in 0.154 s 1.3e+07 tot NU pts/s + one targ: rel err in F[500000] is 3.76e-07 +test 3d type 1: + 1000000 NU pts to (100,200,50) modes in 0.202 s 4.96e+06 NU pts/s + one mode: rel err in F[37,52,-19] is 2.01e-07 +test 3d type 2: + (100,200,50) modes to 1000000 NU pts in 0.103 s 9.73e+06 NU pts/s + one targ: rel err in c[500000] is 6.76e-07 +test 3d type 3: + 1000000 NU to 1000000 NU in 0.371 s 5.39e+06 tot NU pts/s + one targ: rel err in F[500000] is 1.35e-07 + +double-precision 1-thread tests: size = 1e6, tol = 1e-6... +test 1d type 1: + 1000000 NU pts to 1000000 modes in 0.0919 s 1.09e+07 NU pts/s + one mode: rel err in F[370000] is 6.59e-08 +test 1d type 2: + 1000000 modes to 1000000 NU pts in 0.115 s 8.68e+06 NU pts/s + one targ: rel err in c[500000] is 1.87e-07 +test 1d type 3: + 1000000 NU to 1000000 NU in 0.379 s 5.27e+06 tot NU pts/s + one targ: rel err in F[500000] is 5.93e-07 +test 2d type 1: + 1000000 NU pts to (500,2000) modes in 0.126 s 7.91e+06 NU pts/s + one mode: rel err in F[185,520] is 2.92e-09 +test 2d type 2: + (500,2000) modes to 1000000 NU pts in 0.136 s 7.34e+06 NU pts/s + one targ: rel err in c[500000] is 6.2e-07 +test 2d type 3: + 1000000 NU to 1000000 NU in 0.565 s 3.54e+06 tot NU pts/s + one targ: rel err in F[500000] is 3.76e-07 +test 3d type 1: + 1000000 NU pts to (100,200,50) modes in 0.417 s 2.4e+06 NU pts/s + one mode: rel err in F[37,52,-19] is 2.01e-07 +test 3d type 2: + (100,200,50) modes to 1000000 NU pts in 0.357 s 2.8e+06 NU pts/s + one targ: rel err in c[500000] is 6.76e-07 +test 3d type 3: + 1000000 NU to 1000000 NU in 1.26 s 1.59e+06 tot NU pts/s + one targ: rel err in F[500000] is 1.35e-07 diff --git a/perftest/results/nuffttestndf_results.txt b/perftest/results/nuffttestndf_results.txt new file mode 100644 index 000000000..f7aa062fd --- /dev/null +++ b/perftest/results/nuffttestndf_results.txt @@ -0,0 +1,103 @@ +nuffttestnd output: +what CPUs do I have?... +(I'm in a linux OS) +model name : AMD Ryzen 7 4800H with Radeon Graphics +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Address sizes: 48 bits physical, 48 bits virtual +Byte Order: Little Endian +CPU(s): 16 +On-line CPU(s) list: 0-15 +Vendor ID: AuthenticAMD +Model name: AMD Ryzen 7 4800H with Radeon Graphics +CPU family: 23 +Model: 96 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +Stepping: 1 +Frequency boost: enabled +CPU(s) scaling MHz: 71% +CPU max MHz: 2900,0000 +CPU min MHz: 1400,0000 +BogoMIPS: 5789,17 +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca +Virtualization: AMD-V +L1d cache: 256 KiB (8 instances) +L1i cache: 256 KiB (8 instances) +L2 cache: 4 MiB (8 instances) +L3 cache: 8 MiB (2 instances) +NUMA node(s): 1 +NUMA node0 CPU(s): 0-15 +Vulnerability Gather data sampling: Not affected +Vulnerability Itlb multihit: Not affected +Vulnerability L1tf: Not affected +Vulnerability Mds: Not affected +Vulnerability Meltdown: Not affected +Vulnerability Mmio stale data: Not affected +Vulnerability Reg file data sampling: Not affected +Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection +Vulnerability Spec rstack overflow: Mitigation; Safe RET +Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl +Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +Vulnerability Srbds: Not affected +Vulnerability Tsx async abort: Not affected + +single-precision 8-thread tests: size = 1e6, tol = 1e-6... +test 1d type 1: + 1000000 NU pts to 1000000 modes in 0.0366 s 2.73e+07 NU pts/s + one mode: rel err in F[370000] is 0.00314 +test 1d type 2: + 1000000 modes to 1000000 NU pts in 0.0222 s 4.5e+07 NU pts/s + one targ: rel err in c[500000] is 0.0043 +test 1d type 3: + 1000000 NU to 1000000 NU in 0.101 s 1.98e+07 tot NU pts/s + one targ: rel err in F[500000] is -nan +test 2d type 1: + 1000000 NU pts to (500,2000) modes in 0.0346 s 2.89e+07 NU pts/s + one mode: rel err in F[185,520] is 8.9e-07 +test 2d type 2: + (500,2000) modes to 1000000 NU pts in 0.0228 s 4.39e+07 NU pts/s + one targ: rel err in c[500000] is 4.24e-05 +test 2d type 3: + 1000000 NU to 1000000 NU in 0.127 s 1.57e+07 tot NU pts/s + one targ: rel err in F[500000] is 9.8e-05 +test 3d type 1: + 1000000 NU pts to (100,200,50) modes in 0.143 s 6.97e+06 NU pts/s + one mode: rel err in F[37,52,-19] is 4.7e-06 +test 3d type 2: + (100,200,50) modes to 1000000 NU pts in 0.0707 s 1.41e+07 NU pts/s + one targ: rel err in c[500000] is 3.65e-06 +test 3d type 3: + 1000000 NU to 1000000 NU in 0.233 s 8.58e+06 tot NU pts/s + one targ: rel err in F[500000] is 1.36e-05 + +single-precision 1-thread tests: size = 1e6, tol = 1e-6... +test 1d type 1: + 1000000 NU pts to 1000000 modes in 0.0709 s 1.41e+07 NU pts/s + one mode: rel err in F[370000] is 0.00294 +test 1d type 2: + 1000000 modes to 1000000 NU pts in 0.0722 s 1.39e+07 NU pts/s + one targ: rel err in c[500000] is 0.00389 +test 1d type 3: + 1000000 NU to 1000000 NU in 0.288 s 6.95e+06 tot NU pts/s + one targ: rel err in F[500000] is 0.0235 +test 2d type 1: + 1000000 NU pts to (500,2000) modes in 0.0868 s 1.15e+07 NU pts/s + one mode: rel err in F[185,520] is 1.53e-06 +test 2d type 2: + (500,2000) modes to 1000000 NU pts in 0.0869 s 1.15e+07 NU pts/s + one targ: rel err in c[500000] is 3.15e-05 +test 2d type 3: + 1000000 NU to 1000000 NU in 0.466 s 4.29e+06 tot NU pts/s + one targ: rel err in F[500000] is 5.02e-05 +test 3d type 1: + 1000000 NU pts to (100,200,50) modes in 0.259 s 3.86e+06 NU pts/s + one mode: rel err in F[37,52,-19] is 5.01e-06 +test 3d type 2: + (100,200,50) modes to 1000000 NU pts in 0.247 s 4.05e+06 NU pts/s + one targ: rel err in c[500000] is 3.27e-06 +test 3d type 3: + 1000000 NU to 1000000 NU in 0.961 s 2.08e+06 tot NU pts/s + one targ: rel err in F[500000] is 1.32e-05 diff --git a/perftest/results/spreadtestnd_results.txt b/perftest/results/spreadtestnd_results.txt new file mode 100644 index 000000000..6eadba481 --- /dev/null +++ b/perftest/results/spreadtestnd_results.txt @@ -0,0 +1,97 @@ +spreadtestnd output: +what CPUs do I have?... +(I'm in a linux OS) +model name : AMD Ryzen 7 4800H with Radeon Graphics +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Address sizes: 48 bits physical, 48 bits virtual +Byte Order: Little Endian +CPU(s): 16 +On-line CPU(s) list: 0-15 +Vendor ID: AuthenticAMD +Model name: AMD Ryzen 7 4800H with Radeon Graphics +CPU family: 23 +Model: 96 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +Stepping: 1 +Frequency boost: enabled +CPU(s) scaling MHz: 134% +CPU max MHz: 2900,0000 +CPU min MHz: 1400,0000 +BogoMIPS: 5789,17 +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca +Virtualization: AMD-V +L1d cache: 256 KiB (8 instances) +L1i cache: 256 KiB (8 instances) +L2 cache: 4 MiB (8 instances) +L3 cache: 8 MiB (2 instances) +NUMA node(s): 1 +NUMA node0 CPU(s): 0-15 +Vulnerability Gather data sampling: Not affected +Vulnerability Itlb multihit: Not affected +Vulnerability L1tf: Not affected +Vulnerability Mds: Not affected +Vulnerability Meltdown: Not affected +Vulnerability Mmio stale data: Not affected +Vulnerability Reg file data sampling: Not affected +Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection +Vulnerability Spec rstack overflow: Mitigation; Safe RET +Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl +Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +Vulnerability Srbds: Not affected +Vulnerability Tsx async abort: Not affected + +double-precision 8-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... +making random data... +spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0166 s 6.04e+07 pts/s 4.23e+08 spread pts/s + rel err in total over grid: 6.24e-07 +making more random NU pts... +spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0116 s 8.62e+07 pts/s 6.03e+08 spread pts/s + max rel err in values at NU pts: 1.13e-06 +making random data... +spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0314 s 3.18e+07 pts/s 1.56e+09 spread pts/s + rel err in total over grid: 4.55e-07 +making more random NU pts... +spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0171 s 5.85e+07 pts/s 2.87e+09 spread pts/s + max rel err in values at NU pts: 2.27e-06 +making random data... +spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0592 s 1.69e+07 pts/s 5.79e+09 spread pts/s + rel err in total over grid: 1.46e-06 +making more random NU pts... +spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0311 s 3.21e+07 pts/s 1.1e+10 spread pts/s + max rel err in values at NU pts: 3.33e-06 + +double-precision 1-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... +making random data... +spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0316 s 3.16e+07 pts/s 2.21e+08 spread pts/s + rel err in total over grid: 8.07e-07 +making more random NU pts... +spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0468 s 2.14e+07 pts/s 1.5e+08 spread pts/s + max rel err in values at NU pts: 1.13e-06 +making random data... +spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0584 s 1.71e+07 pts/s 8.39e+08 spread pts/s + rel err in total over grid: 1.05e-06 +making more random NU pts... +spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.074 s 1.35e+07 pts/s 6.62e+08 spread pts/s + max rel err in values at NU pts: 2.26e-06 +making random data... +spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.176 s 5.69e+06 pts/s 1.95e+09 spread pts/s + rel err in total over grid: 1.93e-06 +making more random NU pts... +spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.168 s 5.94e+06 pts/s 2.04e+09 spread pts/s + max rel err in values at NU pts: 3.37e-06 diff --git a/perftest/results/spreadtestndf_results.txt b/perftest/results/spreadtestndf_results.txt new file mode 100644 index 000000000..236ff9b82 --- /dev/null +++ b/perftest/results/spreadtestndf_results.txt @@ -0,0 +1,97 @@ +spreadtestnd output: +what CPUs do I have?... +(I'm in a linux OS) +model name : AMD Ryzen 7 4800H with Radeon Graphics +Architecture: x86_64 +CPU op-mode(s): 32-bit, 64-bit +Address sizes: 48 bits physical, 48 bits virtual +Byte Order: Little Endian +CPU(s): 16 +On-line CPU(s) list: 0-15 +Vendor ID: AuthenticAMD +Model name: AMD Ryzen 7 4800H with Radeon Graphics +CPU family: 23 +Model: 96 +Thread(s) per core: 2 +Core(s) per socket: 8 +Socket(s): 1 +Stepping: 1 +Frequency boost: enabled +CPU(s) scaling MHz: 64% +CPU max MHz: 2900,0000 +CPU min MHz: 1400,0000 +BogoMIPS: 5789,17 +Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca +Virtualization: AMD-V +L1d cache: 256 KiB (8 instances) +L1i cache: 256 KiB (8 instances) +L2 cache: 4 MiB (8 instances) +L3 cache: 8 MiB (2 instances) +NUMA node(s): 1 +NUMA node0 CPU(s): 0-15 +Vulnerability Gather data sampling: Not affected +Vulnerability Itlb multihit: Not affected +Vulnerability L1tf: Not affected +Vulnerability Mds: Not affected +Vulnerability Meltdown: Not affected +Vulnerability Mmio stale data: Not affected +Vulnerability Reg file data sampling: Not affected +Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection +Vulnerability Spec rstack overflow: Mitigation; Safe RET +Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl +Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization +Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected +Vulnerability Srbds: Not affected +Vulnerability Tsx async abort: Not affected + +single-precision 8-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... +making random data... +spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0109 s 9.14e+07 pts/s 6.4e+08 spread pts/s + rel err in total over grid: 1.31e-05 +making more random NU pts... +spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.00645 s 1.55e+08 pts/s 1.09e+09 spread pts/s + max rel err in values at NU pts: 1.23e-06 +making random data... +spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0202 s 4.94e+07 pts/s 2.42e+09 spread pts/s + rel err in total over grid: 2.96e-05 +making more random NU pts... +spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0114 s 8.76e+07 pts/s 4.29e+09 spread pts/s + max rel err in values at NU pts: 2.41e-06 +making random data... +spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0343 s 2.92e+07 pts/s 1e+10 spread pts/s + rel err in total over grid: 1.33e-05 +making more random NU pts... +spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0227 s 4.4e+07 pts/s 1.51e+10 spread pts/s + max rel err in values at NU pts: 3.63e-06 + +single-precision 1-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... +making random data... +spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.026 s 3.85e+07 pts/s 2.7e+08 spread pts/s + rel err in total over grid: 3.33e-05 +making more random NU pts... +spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0335 s 2.99e+07 pts/s 2.09e+08 spread pts/s + max rel err in values at NU pts: 1.23e-06 +making random data... +spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0492 s 2.03e+07 pts/s 9.97e+08 spread pts/s + rel err in total over grid: 5.51e-06 +making more random NU pts... +spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.0555 s 1.8e+07 pts/s 8.83e+08 spread pts/s + max rel err in values at NU pts: 2.3e-06 +making random data... +spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.109 s 9.15e+06 pts/s 3.14e+09 spread pts/s + rel err in total over grid: 3.83e-06 +making more random NU pts... +spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 + 1e+06 NU pts in 0.133 s 7.53e+06 pts/s 2.58e+09 spread pts/s + max rel err in values at NU pts: 3.82e-06 diff --git a/perftest/spreadtestnd.cpp b/perftest/spreadtestnd.cpp index 8c35828bd..5aab26fb3 100644 --- a/perftest/spreadtestnd.cpp +++ b/perftest/spreadtestnd.cpp @@ -180,10 +180,10 @@ int main(int argc, char *argv[]) unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) for (BIGINT i = 0; i < M; ++i) { - kx[i] = randm11r(&se) * 3 * M_PI; + kx[i] = randm11r(&se) * 3 * PI; // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period - if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords - if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * PI; // only fill needed coords + if (d > 2) kz[i] = randm11r(&se) * 3 * PI; d_nonuniform[i * 2] = randm11r(&se); d_nonuniform[i * 2 + 1] = randm11r(&se); strre += d_nonuniform[2 * i]; @@ -248,9 +248,9 @@ int main(int argc, char *argv[]) #pragma omp for schedule(dynamic, 1000000) for (BIGINT i = 0; i < M; ++i) { // random target pts // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges - kx[i] = randm11r(&se) * 3 * M_PI; - if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; - if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + kx[i] = randm11r(&se) * 3 * PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * PI; + if (d > 2) kz[i] = randm11r(&se) * 3 * PI; } } diff --git a/perftest/spreadtestndall.cpp b/perftest/spreadtestndall.cpp index 9787d86b0..950c3526e 100644 --- a/perftest/spreadtestndall.cpp +++ b/perftest/spreadtestndall.cpp @@ -161,8 +161,8 @@ int main(int argc, char *argv[]) opts.spread_direction = 1; d_nonuniform[0] = 1.0; - d_nonuniform[1] = 0.0; // unit strength - kx[0] = ky[0] = kz[0] = M_PI / 2.0; // at center + d_nonuniform[1] = 0.0; // unit strength + kx[0] = ky[0] = kz[0] = PI / 2.0; // at center int ier = spreadinterp(N, N2, N3, d_uniform.data(), 1, kx.data(), ky.data(), kz.data(), d_nonuniform.data(), opts); // vector::data officially C++11 but works @@ -184,10 +184,10 @@ int main(int argc, char *argv[]) unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(dynamic, 1000000) reduction(+ : strre, strim) for (BIGINT i = 0; i < M; ++i) { - kx[i] = randm11r(&se) * 3 * M_PI; + kx[i] = randm11r(&se) * 3 * PI; // kx[i]=2.0*kx[i] - 50.0; //// to test folding within +-1 period - if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; // only fill needed coords - if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * PI; // only fill needed coords + if (d > 2) kz[i] = randm11r(&se) * 3 * PI; d_nonuniform[i * 2] = randm11r(&se); d_nonuniform[i * 2 + 1] = randm11r(&se); strre += d_nonuniform[2 * i]; @@ -237,9 +237,9 @@ int main(int argc, char *argv[]) #pragma omp for schedule(dynamic, 1000000) for (BIGINT i = 0; i < M; ++i) { // random target pts // kx[i]=10+.9*rand01r(&s)*N; // or if want to keep ns away from edges - kx[i] = randm11r(&se) * 3 * M_PI; - if (d > 1) ky[i] = randm11r(&se) * 3 * M_PI; - if (d > 2) kz[i] = randm11r(&se) * 3 * M_PI; + kx[i] = randm11r(&se) * 3 * PI; + if (d > 1) ky[i] = randm11r(&se) * 3 * PI; + if (d > 2) kz[i] = randm11r(&se) * 3 * PI; } } diff --git a/src/fft.cpp b/src/fft.cpp index 68877cacd..958d62953 100644 --- a/src/fft.cpp +++ b/src/fft.cpp @@ -34,7 +34,7 @@ template void do_fft(FINUFFT_PLAN_T *p) { arrdims.push_back(size_t(ns[2])); axes.push_back(3); } - ducc0::vfmav> data(p->fwBatch, arrdims); + ducc0::vfmav> data(p->fwBatch.data(), arrdims); #ifdef FINUFFT_NO_DUCC0_TWEAKS ducc0::c2c(data, data, axes, p->fftSign < 0, TF(1), nthreads); #else diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index 464ac5446..bda9f3194 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -74,8 +74,6 @@ Design notes for guru interface implementation: namespace finufft { namespace common { -static constexpr double PI = 3.14159265358979329; - static int set_nf_type12(BIGINT ms, const finufft_opts &opts, const finufft_spread_opts &spopts, BIGINT *nf) // Type 1 & 2 recipe for how to set 1d size of upsampled array, nf, given opts @@ -444,8 +442,9 @@ static int spreadinterpSortedBatch(int batchSize, FINUFFT_PLAN_T *p, #endif #pragma omp parallel for num_threads(nthr_outer) for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch + std::complex *fwi = p->fwBatch.data() + i * p->nf; // start of i'th fw array in + // wkspace + std::complex *ci = cBatch + i * p->nj; // start of i'th c array in cBatch spreadinterpSorted(p->sortIndices, p->nf1, p->nf2, p->nf3, (T *)fwi, p->nj, p->X, p->Y, p->Z, (T *)ci, p->spopts, p->didSort); } @@ -468,8 +467,9 @@ static int deconvolveBatch(int batchSize, FINUFFT_PLAN_T *p, std::complex // since deconvolveshuffle?d are single-thread, omp par seems to help here... #pragma omp parallel for num_threads(batchSize) for (int i = 0; i < batchSize; i++) { - std::complex *fwi = p->fwBatch + i * p->nf; // start of i'th fw array in wkspace - std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch + std::complex *fwi = p->fwBatch.data() + i * p->nf; // start of i'th fw array in + // wkspace + std::complex *fki = fkBatch + i * p->N; // start of i'th fk array in fkBatch // Call routine from common.cpp for the dim; prefactors hardcoded to 1.0... if (p->dim == 1) @@ -709,20 +709,15 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i } timer.restart(); - fwBatch = fftPlan->alloc_complex(nf * batchSize); // the big workspace + fwBatch.resize(nf * batchSize); // the big workspace if (opts.debug) printf("[%s] fwBatch %.2fGB alloc: \t%.3g s\n", __func__, (double)1E-09 * sizeof(std::complex) * nf * batchSize, timer.elapsedsec()); - if (!fwBatch) { // we don't catch all such allocs, just this big one - fprintf(stderr, "[%s] FFT allocation failed for fwBatch (working fine grids)!\n", - __func__); - throw int(FINUFFT_ERR_ALLOC); - } timer.restart(); // plan the FFTW const auto ns = gridsize_for_fft(this); - fftPlan->plan(ns, batchSize, fwBatch, fftSign, opts.fftw, nthr_fft); + fftPlan->plan(ns, batchSize, fwBatch.data(), fftSign, opts.fftw, nthr_fft); if (opts.debug) printf("[%s] FFT plan (mode %d, nthr=%d):\t%.3g s\n", __func__, opts.fftw, nthr_fft, timer.elapsedsec()); @@ -730,9 +725,6 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i } else { // -------------------------- type 3 (no planning) ------------ if (opts.debug) printf("[%s] %dd%d: ntrans=%d\n", __func__, dim, type, ntrans); - // in case destroy occurs before setpts, need safe dummy ptrs/plans... - fwBatch = nullptr; - innerT2plan = nullptr; // Type 3 will call finufft_makeplan for type 2; no need to init FFTW // Note we don't even know nj or nk yet, so can't do anything else! } @@ -861,19 +853,14 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF __func__); return FINUFFT_ERR_MAXNALLOC; } - fftPlan->free(fwBatch); - fwBatch = fftPlan->alloc_complex(nf * batchSize); // maybe big workspace + fwBatch.resize(nf * batchSize); // maybe big workspace - CpBatch.resize(nj * batchSize); // batch c' work + CpBatch.resize(nj * batchSize); // batch c' work if (opts.debug) printf("[%s t3] widcen, batch %.2fGB alloc:\t%.3g s\n", __func__, (double)1E-09 * sizeof(std::complex) * (nf + nj) * batchSize, timer.elapsedsec()); - if (!fwBatch) { - fprintf(stderr, "[%s t3] allocation fail for fwBatch or CpBatch!\n", __func__); - return FINUFFT_ERR_ALLOC; - } // printf("fwbatch, cpbatch ptrs: %llx %llx\n",fwBatch,CpBatch); // alloc rescaled NU src pts x'_j (in X etc), rescaled NU targ pts s'_k ... @@ -1130,7 +1117,7 @@ int FINUFFT_PLAN_T::execute(std::complex *cj, std::complex *fk) { innerT2plan->ntrans = thisBatchSize; // do not try this at home! /* (alarming that FFT not shrunk, but safe, because t2's fwBatch array still the same size, as Andrea explained; just wastes a few flops) */ - innerT2plan->execute(fkb, fwBatch); + innerT2plan->execute(fkb, fwBatch.data()); t_t2 += timer.elapsedsec(); // STEP 3: apply deconvolve (precomputed 1/phiHat(targ_k), phasing too)... timer.restart(); @@ -1165,7 +1152,6 @@ template FINUFFT_PLAN_T::~FINUFFT_PLAN_T() { // Also must not crash if called immediately after finufft_makeplan. // Thus either each thing free'd here is guaranteed to be nullptr or correctly // allocated. - if (fftPlan) fftPlan->free(fwBatch); // free the big FFT (or t3 spread) working array } template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); template FINUFFT_PLAN_T::~FINUFFT_PLAN_T(); diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index a066a499d..97c0126a4 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -219,7 +219,7 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset */ template static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept { - static constexpr const T x2pi = T(M_1_2PI); + static constexpr const T x2pi = T(INV_2PI); const T result = x * x2pi + T(0.5); return (result - floor(result)) * T(N); } @@ -2200,7 +2200,7 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps if (upsampfac == 2.0) // standard sigma (see SISC paper) ns = std::ceil(-log10(eps / (T)10.0)); // 1 digit per power of 10 else // custom sigma - ns = std::ceil(-log(eps) / (T(M_PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 + ns = std::ceil(-log(eps) / (T(PI) * sqrt(1.0 - 1.0 / upsampfac))); // formula, gam=1 ns = max(2, ns); // (we don't have ns=1 version yet) if (ns > MAX_NSPREAD) { // clip to fit allocated arrays, Horner rules if (showwarn) @@ -2222,8 +2222,8 @@ FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps if (ns == 4) betaoverns = 2.38; if (upsampfac != 2.0) { // again, override beta for custom sigma T gamma = 0.97; // must match devel/gen_all_horner_C_code.m ! - betaoverns = gamma * T(M_PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based on - // cutoff + betaoverns = gamma * T(PI) * (1.0 - 1.0 / (2 * upsampfac)); // formula based on + // cutoff } opts.ES_beta = betaoverns * ns; // set the kernel beta parameter if (debug) @@ -2278,7 +2278,7 @@ T evaluate_kernel_horner_kernel(T x, const finufft_spread_opts &opts) } }(); static constexpr auto nc = horner_coeffs.size(); - T res = 0.0; + T res = 0.0; for (uint8_t i = 0; i < ns; i++) { if (x > -opts.ES_halfwidth + i && x <= -opts.ES_halfwidth + i + 1) { T z = std::fma(T(2.0), x - T(i), T(ns - 1)); diff --git a/test/basicpassfail.cpp b/test/basicpassfail.cpp index 9ac1608f7..ce23637db 100644 --- a/test/basicpassfail.cpp +++ b/test/basicpassfail.cpp @@ -14,12 +14,12 @@ int main() { std::vector F(N); // alloc output mode coeffs // Make the input data.................................... - srand(42); // seed (fixed) - std::vector x(M); // NU pts locs - std::vector c(M); // strengths + srand(42); // seed (fixed) + std::vector x(M); // NU pts locs + std::vector c(M); // strengths for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); // uniform random in - // [-pi,pi) + x[j] = PI * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); // uniform random in + // [-pi,pi) c[j] = 2 * ((FLT)rand() / (FLT)RAND_MAX) - 1 + I * (2 * ((FLT)rand() / (FLT)RAND_MAX) - 1); } diff --git a/test/finufft1dmany_test.cpp b/test/finufft1dmany_test.cpp index 1219670a1..f1a250cbb 100644 --- a/test/finufft1dmany_test.cpp +++ b/test/finufft1dmany_test.cpp @@ -56,7 +56,7 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); } #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < ntransf * M; ++j) { @@ -179,9 +179,9 @@ int main(int argc, char *argv[]) { { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) - for (BIGINT j = 0; j < M; ++j) x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs + for (BIGINT j = 0; j < M; ++j) x[j] = 2.0 + FLT(PI) * randm11r(&se); // new x_j srcs } - FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs + FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs FLT S = (FLT)N / 2; // choose freq range sim to type 1 #pragma omp parallel { diff --git a/test/finufft2d_test.cpp b/test/finufft2d_test.cpp index 50536024f..eb1fec761 100644 --- a/test/finufft2d_test.cpp +++ b/test/finufft2d_test.cpp @@ -53,8 +53,8 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - y[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); c[j] = crandm11r(&se); } } @@ -138,8 +138,8 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin - y[j] = -3.0 + M_PI * randm11r(&se); // " y_j + x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs, offset from origin + y[j] = -3.0 + PI * randm11r(&se); // " y_j } } FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt) diff --git a/test/finufft2dmany_test.cpp b/test/finufft2dmany_test.cpp index 155c71ae6..640653ac0 100644 --- a/test/finufft2dmany_test.cpp +++ b/test/finufft2dmany_test.cpp @@ -60,8 +60,8 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - y[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); } #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < ntransf * M; ++j) { @@ -190,8 +190,8 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin - y[j] = -3.0 + M_PI * randm11r(&se); // " y_j + x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs, offset from origin + y[j] = -3.0 + PI * randm11r(&se); // " y_j } } diff --git a/test/finufft3d_test.cpp b/test/finufft3d_test.cpp index fba1b1cae..1e89d471c 100644 --- a/test/finufft3d_test.cpp +++ b/test/finufft3d_test.cpp @@ -58,9 +58,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - y[j] = M_PI * randm11r(&se); - z[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + z[j] = PI * randm11r(&se); c[j] = crandm11r(&se); } } @@ -144,9 +144,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin - y[j] = -3.0 + M_PI * randm11r(&se); // " y_j - z[j] = 1.0 + M_PI * randm11r(&se); // " z_j + x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs, offset from origin + y[j] = -3.0 + PI * randm11r(&se); // " y_j + z[j] = 1.0 + PI * randm11r(&se); // " z_j } } FLT *s = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt) diff --git a/test/finufft3dkernel_test.cpp b/test/finufft3dkernel_test.cpp index 60f8f290b..9bc6d1955 100644 --- a/test/finufft3dkernel_test.cpp +++ b/test/finufft3dkernel_test.cpp @@ -74,9 +74,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); // needed for parallel random #s #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - y[j] = M_PI * randm11r(&se); - z[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + z[j] = PI * randm11r(&se); c0[j] = crandm11r(&se); } } @@ -143,9 +143,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin - y[j] = -3.0 + M_PI * randm11r(&se); // " y_j - z[j] = 1.0 + M_PI * randm11r(&se); // " z_j + x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs, offset from origin + y[j] = -3.0 + PI * randm11r(&se); // " y_j + z[j] = 1.0 + PI * randm11r(&se); // " z_j } } std::vector s(N); // targ freqs (1-cmpt) diff --git a/test/finufft3dmany_test.cpp b/test/finufft3dmany_test.cpp index ad7c954c6..f00f25e5a 100644 --- a/test/finufft3dmany_test.cpp +++ b/test/finufft3dmany_test.cpp @@ -63,9 +63,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = M_PI * randm11r(&se); - y[j] = M_PI * randm11r(&se); - z[j] = M_PI * randm11r(&se); + x[j] = PI * randm11r(&se); + y[j] = PI * randm11r(&se); + z[j] = PI * randm11r(&se); } #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < ntransf * M; ++j) { @@ -205,9 +205,9 @@ int main(int argc, char *argv[]) { unsigned int se = MY_OMP_GET_THREAD_NUM(); #pragma omp for schedule(static, TEST_RANDCHUNK) for (BIGINT j = 0; j < M; ++j) { - x[j] = 2.0 + M_PI * randm11r(&se); // new x_j srcs, offset from origin - y[j] = -3.0 + M_PI * randm11r(&se); // " y_j - z[j] = 1.0 + M_PI * randm11r(&se); // " z_j + x[j] = 2.0 + PI * randm11r(&se); // new x_j srcs, offset from origin + y[j] = -3.0 + PI * randm11r(&se); // " y_j + z[j] = 1.0 + PI * randm11r(&se); // " z_j } } FLT *s_freq = (FLT *)malloc(sizeof(FLT) * N); // targ freqs (1-cmpt) From a7a8c6c67e1b6f42460d38b8bfb0640ef8f4bf9e Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 28 Oct 2024 08:07:25 +0100 Subject: [PATCH 05/11] small fixes; CMake still broken --- CMakeLists.txt | 1 - perftest/perftest.cpp | 12 ++++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ecee27aee..a36c05892 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -168,7 +168,6 @@ if(FINUFFT_USE_CPU) set(FFTW_VERSION 3.3.10) set(XTL_VERSION 0.7.7) set(XSIMD_VERSION 13.0.0) - # using latest ducc0 version for now as it fixes MacOS GCC build set(DUCC0_VERSION ducc0_0_35_0) set(FINUFFT_FFTW_LIBRARIES) include(cmake/setupXSIMD.cmake) diff --git a/perftest/perftest.cpp b/perftest/perftest.cpp index 041137bed..3e933641f 100644 --- a/perftest/perftest.cpp +++ b/perftest/perftest.cpp @@ -190,9 +190,9 @@ template void run_test(test_options_t &test_opts) { // Making data for (int64_t i = 0; i < M; i++) { - x[i] = PI * randm11(); // x in [-pi,pi) - y[i] = PI * randm11(); - z[i] = PI * randm11(); + x[i] = M_PI * randm11(); // x in [-pi,pi) + y[i] = M_PI * randm11(); + z[i] = M_PI * randm11(); } for (int64_t i = M; i < M * ntransf; ++i) { int64_t j = i % M; @@ -218,9 +218,9 @@ template void run_test(test_options_t &test_opts) { c[i].imag(randm11()); } for (int i = 0; i < N * ntransf; i++) { - s[i] = PI * randm11() * test_opts.bandwidth; - t[i] = PI * randm11() * test_opts.bandwidth; - u[i] = PI * randm11() * test_opts.bandwidth; + s[i] = M_PI * randm11() * test_opts.bandwidth; + t[i] = M_PI * randm11() * test_opts.bandwidth; + u[i] = M_PI * randm11() * test_opts.bandwidth; } } else { From d7461801c31b2f910bb5116b49f3db72c22bb1c4 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 28 Oct 2024 08:25:26 +0100 Subject: [PATCH 06/11] attempt to fix CMake --- perftest/CMakeLists.txt | 2 ++ test/CMakeLists.txt | 2 ++ 2 files changed, 4 insertions(+) diff --git a/perftest/CMakeLists.txt b/perftest/CMakeLists.txt index f0215a6df..29019b0ff 100644 --- a/perftest/CMakeLists.txt +++ b/perftest/CMakeLists.txt @@ -6,6 +6,7 @@ foreach(TEST ${PERFTESTS}) if(FINUFFT_USE_DUCC0) target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0) endif() + target_link_libraries(${TEST} PRIVATE xsimd) finufft_link_test(${TEST}) add_executable(${TEST}f ${TEST}.cpp) @@ -13,6 +14,7 @@ foreach(TEST ${PERFTESTS}) if(FINUFFT_USE_DUCC0) target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0) endif() + target_link_libraries(${TEST}f PRIVATE xsimd) finufft_link_test(${TEST}f) endforeach() diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3736c2caa..50b0210e2 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,11 +13,13 @@ set(TESTS foreach(TEST ${TESTS}) add_executable(${TEST} ${TEST}.cpp) target_compile_features(${TEST} PRIVATE cxx_std_17) + target_link_libraries(${TEST} PRIVATE xsimd) finufft_link_test(${TEST}) add_executable(${TEST}f ${TEST}.cpp) target_compile_definitions(${TEST}f PRIVATE -DSINGLE) target_compile_features(${TEST}f PRIVATE cxx_std_17) + target_link_libraries(${TEST}f PRIVATE xsimd) finufft_link_test(${TEST}f) endforeach() From 0823fc66770aa5a7bb9585a47284bb5a8585e37a Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 28 Oct 2024 08:31:32 +0100 Subject: [PATCH 07/11] more fixes --- test/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 50b0210e2..654dac5e4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -32,6 +32,7 @@ add_executable(testutils testutils.cpp) if(FINUFFT_USE_DUCC0) target_compile_definitions(testutils PRIVATE -DFINUFFT_USE_DUCC0) endif() +target_link_libraries(testutils PRIVATE xsimd) target_compile_features(testutils PRIVATE cxx_std_17) finufft_link_test(testutils) add_test( @@ -42,6 +43,7 @@ add_test( if(NOT FINUFFT_USE_DUCC0) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) + target_link_libraries(fftw_lock_test PRIVATE xsimd) finufft_link_test(fftw_lock_test) add_test( From 7e90eb7ee812006a47ab30460e0f05c8ab0ada41 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 28 Oct 2024 08:52:09 +0100 Subject: [PATCH 08/11] make all object files depend on xsimd --- makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/makefile b/makefile index 90e2db250..ae0e8d0d8 100644 --- a/makefile +++ b/makefile @@ -175,7 +175,7 @@ usage: HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) # implicit rules for objects (note -o ensures writes to correct dir) -%.o: %.cpp $(HEADERS) +%.o: %.cpp $(HEADERS) $(XSIMD_DIR)/include/xsimd/xsimd.hpp $(CXX) -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ From 2502e927789a3d393d9f8696b1862064812456e8 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 28 Oct 2024 09:12:53 +0100 Subject: [PATCH 09/11] remove accidentally commited files --- perftest/results/nuffttestnd_results.txt | 103 --------------------- perftest/results/nuffttestndf_results.txt | 103 --------------------- perftest/results/spreadtestnd_results.txt | 97 ------------------- perftest/results/spreadtestndf_results.txt | 97 ------------------- 4 files changed, 400 deletions(-) delete mode 100644 perftest/results/nuffttestnd_results.txt delete mode 100644 perftest/results/nuffttestndf_results.txt delete mode 100644 perftest/results/spreadtestnd_results.txt delete mode 100644 perftest/results/spreadtestndf_results.txt diff --git a/perftest/results/nuffttestnd_results.txt b/perftest/results/nuffttestnd_results.txt deleted file mode 100644 index 4b69b413a..000000000 --- a/perftest/results/nuffttestnd_results.txt +++ /dev/null @@ -1,103 +0,0 @@ -nuffttestnd output: -what CPUs do I have?... -(I'm in a linux OS) -model name : AMD Ryzen 7 4800H with Radeon Graphics -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 16 -On-line CPU(s) list: 0-15 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 7 4800H with Radeon Graphics -CPU family: 23 -Model: 96 -Thread(s) per core: 2 -Core(s) per socket: 8 -Socket(s): 1 -Stepping: 1 -Frequency boost: enabled -CPU(s) scaling MHz: 97% -CPU max MHz: 2900,0000 -CPU min MHz: 1400,0000 -BogoMIPS: 5789,17 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca -Virtualization: AMD-V -L1d cache: 256 KiB (8 instances) -L1i cache: 256 KiB (8 instances) -L2 cache: 4 MiB (8 instances) -L3 cache: 8 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-15 -Vulnerability Gather data sampling: Not affected -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Reg file data sampling: Not affected -Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection -Vulnerability Spec rstack overflow: Mitigation; Safe RET -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -double-precision 8-thread tests: size = 1e6, tol = 1e-6... -test 1d type 1: - 1000000 NU pts to 1000000 modes in 0.053 s 1.89e+07 NU pts/s - one mode: rel err in F[370000] is 6.59e-08 -test 1d type 2: - 1000000 modes to 1000000 NU pts in 0.037 s 2.7e+07 NU pts/s - one targ: rel err in c[500000] is 1.87e-07 -test 1d type 3: - 1000000 NU to 1000000 NU in 0.183 s 1.09e+07 tot NU pts/s - one targ: rel err in F[500000] is 5.93e-07 -test 2d type 1: - 1000000 NU pts to (500,2000) modes in 0.0616 s 1.62e+07 NU pts/s - one mode: rel err in F[185,520] is 2.92e-09 -test 2d type 2: - (500,2000) modes to 1000000 NU pts in 0.0426 s 2.35e+07 NU pts/s - one targ: rel err in c[500000] is 6.2e-07 -test 2d type 3: - 1000000 NU to 1000000 NU in 0.154 s 1.3e+07 tot NU pts/s - one targ: rel err in F[500000] is 3.76e-07 -test 3d type 1: - 1000000 NU pts to (100,200,50) modes in 0.202 s 4.96e+06 NU pts/s - one mode: rel err in F[37,52,-19] is 2.01e-07 -test 3d type 2: - (100,200,50) modes to 1000000 NU pts in 0.103 s 9.73e+06 NU pts/s - one targ: rel err in c[500000] is 6.76e-07 -test 3d type 3: - 1000000 NU to 1000000 NU in 0.371 s 5.39e+06 tot NU pts/s - one targ: rel err in F[500000] is 1.35e-07 - -double-precision 1-thread tests: size = 1e6, tol = 1e-6... -test 1d type 1: - 1000000 NU pts to 1000000 modes in 0.0919 s 1.09e+07 NU pts/s - one mode: rel err in F[370000] is 6.59e-08 -test 1d type 2: - 1000000 modes to 1000000 NU pts in 0.115 s 8.68e+06 NU pts/s - one targ: rel err in c[500000] is 1.87e-07 -test 1d type 3: - 1000000 NU to 1000000 NU in 0.379 s 5.27e+06 tot NU pts/s - one targ: rel err in F[500000] is 5.93e-07 -test 2d type 1: - 1000000 NU pts to (500,2000) modes in 0.126 s 7.91e+06 NU pts/s - one mode: rel err in F[185,520] is 2.92e-09 -test 2d type 2: - (500,2000) modes to 1000000 NU pts in 0.136 s 7.34e+06 NU pts/s - one targ: rel err in c[500000] is 6.2e-07 -test 2d type 3: - 1000000 NU to 1000000 NU in 0.565 s 3.54e+06 tot NU pts/s - one targ: rel err in F[500000] is 3.76e-07 -test 3d type 1: - 1000000 NU pts to (100,200,50) modes in 0.417 s 2.4e+06 NU pts/s - one mode: rel err in F[37,52,-19] is 2.01e-07 -test 3d type 2: - (100,200,50) modes to 1000000 NU pts in 0.357 s 2.8e+06 NU pts/s - one targ: rel err in c[500000] is 6.76e-07 -test 3d type 3: - 1000000 NU to 1000000 NU in 1.26 s 1.59e+06 tot NU pts/s - one targ: rel err in F[500000] is 1.35e-07 diff --git a/perftest/results/nuffttestndf_results.txt b/perftest/results/nuffttestndf_results.txt deleted file mode 100644 index f7aa062fd..000000000 --- a/perftest/results/nuffttestndf_results.txt +++ /dev/null @@ -1,103 +0,0 @@ -nuffttestnd output: -what CPUs do I have?... -(I'm in a linux OS) -model name : AMD Ryzen 7 4800H with Radeon Graphics -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 16 -On-line CPU(s) list: 0-15 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 7 4800H with Radeon Graphics -CPU family: 23 -Model: 96 -Thread(s) per core: 2 -Core(s) per socket: 8 -Socket(s): 1 -Stepping: 1 -Frequency boost: enabled -CPU(s) scaling MHz: 71% -CPU max MHz: 2900,0000 -CPU min MHz: 1400,0000 -BogoMIPS: 5789,17 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca -Virtualization: AMD-V -L1d cache: 256 KiB (8 instances) -L1i cache: 256 KiB (8 instances) -L2 cache: 4 MiB (8 instances) -L3 cache: 8 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-15 -Vulnerability Gather data sampling: Not affected -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Reg file data sampling: Not affected -Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection -Vulnerability Spec rstack overflow: Mitigation; Safe RET -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -single-precision 8-thread tests: size = 1e6, tol = 1e-6... -test 1d type 1: - 1000000 NU pts to 1000000 modes in 0.0366 s 2.73e+07 NU pts/s - one mode: rel err in F[370000] is 0.00314 -test 1d type 2: - 1000000 modes to 1000000 NU pts in 0.0222 s 4.5e+07 NU pts/s - one targ: rel err in c[500000] is 0.0043 -test 1d type 3: - 1000000 NU to 1000000 NU in 0.101 s 1.98e+07 tot NU pts/s - one targ: rel err in F[500000] is -nan -test 2d type 1: - 1000000 NU pts to (500,2000) modes in 0.0346 s 2.89e+07 NU pts/s - one mode: rel err in F[185,520] is 8.9e-07 -test 2d type 2: - (500,2000) modes to 1000000 NU pts in 0.0228 s 4.39e+07 NU pts/s - one targ: rel err in c[500000] is 4.24e-05 -test 2d type 3: - 1000000 NU to 1000000 NU in 0.127 s 1.57e+07 tot NU pts/s - one targ: rel err in F[500000] is 9.8e-05 -test 3d type 1: - 1000000 NU pts to (100,200,50) modes in 0.143 s 6.97e+06 NU pts/s - one mode: rel err in F[37,52,-19] is 4.7e-06 -test 3d type 2: - (100,200,50) modes to 1000000 NU pts in 0.0707 s 1.41e+07 NU pts/s - one targ: rel err in c[500000] is 3.65e-06 -test 3d type 3: - 1000000 NU to 1000000 NU in 0.233 s 8.58e+06 tot NU pts/s - one targ: rel err in F[500000] is 1.36e-05 - -single-precision 1-thread tests: size = 1e6, tol = 1e-6... -test 1d type 1: - 1000000 NU pts to 1000000 modes in 0.0709 s 1.41e+07 NU pts/s - one mode: rel err in F[370000] is 0.00294 -test 1d type 2: - 1000000 modes to 1000000 NU pts in 0.0722 s 1.39e+07 NU pts/s - one targ: rel err in c[500000] is 0.00389 -test 1d type 3: - 1000000 NU to 1000000 NU in 0.288 s 6.95e+06 tot NU pts/s - one targ: rel err in F[500000] is 0.0235 -test 2d type 1: - 1000000 NU pts to (500,2000) modes in 0.0868 s 1.15e+07 NU pts/s - one mode: rel err in F[185,520] is 1.53e-06 -test 2d type 2: - (500,2000) modes to 1000000 NU pts in 0.0869 s 1.15e+07 NU pts/s - one targ: rel err in c[500000] is 3.15e-05 -test 2d type 3: - 1000000 NU to 1000000 NU in 0.466 s 4.29e+06 tot NU pts/s - one targ: rel err in F[500000] is 5.02e-05 -test 3d type 1: - 1000000 NU pts to (100,200,50) modes in 0.259 s 3.86e+06 NU pts/s - one mode: rel err in F[37,52,-19] is 5.01e-06 -test 3d type 2: - (100,200,50) modes to 1000000 NU pts in 0.247 s 4.05e+06 NU pts/s - one targ: rel err in c[500000] is 3.27e-06 -test 3d type 3: - 1000000 NU to 1000000 NU in 0.961 s 2.08e+06 tot NU pts/s - one targ: rel err in F[500000] is 1.32e-05 diff --git a/perftest/results/spreadtestnd_results.txt b/perftest/results/spreadtestnd_results.txt deleted file mode 100644 index 6eadba481..000000000 --- a/perftest/results/spreadtestnd_results.txt +++ /dev/null @@ -1,97 +0,0 @@ -spreadtestnd output: -what CPUs do I have?... -(I'm in a linux OS) -model name : AMD Ryzen 7 4800H with Radeon Graphics -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 16 -On-line CPU(s) list: 0-15 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 7 4800H with Radeon Graphics -CPU family: 23 -Model: 96 -Thread(s) per core: 2 -Core(s) per socket: 8 -Socket(s): 1 -Stepping: 1 -Frequency boost: enabled -CPU(s) scaling MHz: 134% -CPU max MHz: 2900,0000 -CPU min MHz: 1400,0000 -BogoMIPS: 5789,17 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca -Virtualization: AMD-V -L1d cache: 256 KiB (8 instances) -L1i cache: 256 KiB (8 instances) -L2 cache: 4 MiB (8 instances) -L3 cache: 8 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-15 -Vulnerability Gather data sampling: Not affected -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Reg file data sampling: Not affected -Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection -Vulnerability Spec rstack overflow: Mitigation; Safe RET -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -double-precision 8-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... -making random data... -spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0166 s 6.04e+07 pts/s 4.23e+08 spread pts/s - rel err in total over grid: 6.24e-07 -making more random NU pts... -spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0116 s 8.62e+07 pts/s 6.03e+08 spread pts/s - max rel err in values at NU pts: 1.13e-06 -making random data... -spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0314 s 3.18e+07 pts/s 1.56e+09 spread pts/s - rel err in total over grid: 4.55e-07 -making more random NU pts... -spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0171 s 5.85e+07 pts/s 2.87e+09 spread pts/s - max rel err in values at NU pts: 2.27e-06 -making random data... -spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0592 s 1.69e+07 pts/s 5.79e+09 spread pts/s - rel err in total over grid: 1.46e-06 -making more random NU pts... -spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0311 s 3.21e+07 pts/s 1.1e+10 spread pts/s - max rel err in values at NU pts: 3.33e-06 - -double-precision 1-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... -making random data... -spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0316 s 3.16e+07 pts/s 2.21e+08 spread pts/s - rel err in total over grid: 8.07e-07 -making more random NU pts... -spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0468 s 2.14e+07 pts/s 1.5e+08 spread pts/s - max rel err in values at NU pts: 1.13e-06 -making random data... -spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0584 s 1.71e+07 pts/s 8.39e+08 spread pts/s - rel err in total over grid: 1.05e-06 -making more random NU pts... -spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.074 s 1.35e+07 pts/s 6.62e+08 spread pts/s - max rel err in values at NU pts: 2.26e-06 -making random data... -spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.176 s 5.69e+06 pts/s 1.95e+09 spread pts/s - rel err in total over grid: 1.93e-06 -making more random NU pts... -spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.168 s 5.94e+06 pts/s 2.04e+09 spread pts/s - max rel err in values at NU pts: 3.37e-06 diff --git a/perftest/results/spreadtestndf_results.txt b/perftest/results/spreadtestndf_results.txt deleted file mode 100644 index 236ff9b82..000000000 --- a/perftest/results/spreadtestndf_results.txt +++ /dev/null @@ -1,97 +0,0 @@ -spreadtestnd output: -what CPUs do I have?... -(I'm in a linux OS) -model name : AMD Ryzen 7 4800H with Radeon Graphics -Architecture: x86_64 -CPU op-mode(s): 32-bit, 64-bit -Address sizes: 48 bits physical, 48 bits virtual -Byte Order: Little Endian -CPU(s): 16 -On-line CPU(s) list: 0-15 -Vendor ID: AuthenticAMD -Model name: AMD Ryzen 7 4800H with Radeon Graphics -CPU family: 23 -Model: 96 -Thread(s) per core: 2 -Core(s) per socket: 8 -Socket(s): 1 -Stepping: 1 -Frequency boost: enabled -CPU(s) scaling MHz: 64% -CPU max MHz: 2900,0000 -CPU min MHz: 1400,0000 -BogoMIPS: 5789,17 -Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd cppc arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca -Virtualization: AMD-V -L1d cache: 256 KiB (8 instances) -L1i cache: 256 KiB (8 instances) -L2 cache: 4 MiB (8 instances) -L3 cache: 8 MiB (2 instances) -NUMA node(s): 1 -NUMA node0 CPU(s): 0-15 -Vulnerability Gather data sampling: Not affected -Vulnerability Itlb multihit: Not affected -Vulnerability L1tf: Not affected -Vulnerability Mds: Not affected -Vulnerability Meltdown: Not affected -Vulnerability Mmio stale data: Not affected -Vulnerability Reg file data sampling: Not affected -Vulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection -Vulnerability Spec rstack overflow: Mitigation; Safe RET -Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl -Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization -Vulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected -Vulnerability Srbds: Not affected -Vulnerability Tsx async abort: Not affected - -single-precision 8-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... -making random data... -spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0109 s 9.14e+07 pts/s 6.4e+08 spread pts/s - rel err in total over grid: 1.31e-05 -making more random NU pts... -spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.00645 s 1.55e+08 pts/s 1.09e+09 spread pts/s - max rel err in values at NU pts: 1.23e-06 -making random data... -spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0202 s 4.94e+07 pts/s 2.42e+09 spread pts/s - rel err in total over grid: 2.96e-05 -making more random NU pts... -spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0114 s 8.76e+07 pts/s 4.29e+09 spread pts/s - max rel err in values at NU pts: 2.41e-06 -making random data... -spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0343 s 2.92e+07 pts/s 1e+10 spread pts/s - rel err in total over grid: 1.33e-05 -making more random NU pts... -spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0227 s 4.4e+07 pts/s 1.51e+10 spread pts/s - max rel err in values at NU pts: 3.63e-06 - -single-precision 1-thread tests: #NU = 1e6, #U = 1e6, tol = 1e-6... -making random data... -spreadinterp 1D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.026 s 3.85e+07 pts/s 2.7e+08 spread pts/s - rel err in total over grid: 3.33e-05 -making more random NU pts... -spreadinterp 1D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0335 s 2.99e+07 pts/s 2.09e+08 spread pts/s - max rel err in values at NU pts: 1.23e-06 -making random data... -spreadinterp 2D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0492 s 2.03e+07 pts/s 9.97e+08 spread pts/s - rel err in total over grid: 5.51e-06 -making more random NU pts... -spreadinterp 2D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.0555 s 1.8e+07 pts/s 8.83e+08 spread pts/s - max rel err in values at NU pts: 2.3e-06 -making random data... -spreadinterp 3D, 1e+06 U pts, dir=1, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.109 s 9.15e+06 pts/s 3.14e+09 spread pts/s - rel err in total over grid: 3.83e-06 -making more random NU pts... -spreadinterp 3D, 1e+06 U pts, dir=2, tol=1e-06: nspread=7 - 1e+06 NU pts in 0.133 s 7.53e+06 pts/s 2.58e+09 spread pts/s - max rel err in values at NU pts: 3.82e-06 From 3be6f0a058f2f80caeb2dfe2d8fa23948e9845a9 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Mon, 4 Nov 2024 15:42:17 +0100 Subject: [PATCH 10/11] address review comments --- CMakeLists.txt | 2 +- perftest/CMakeLists.txt | 2 -- src/spreadinterp.cpp | 3 +-- test/CMakeLists.txt | 4 ---- 4 files changed, 2 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a36c05892..d23c40e1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,7 +198,7 @@ function(finufft_link_test target) if(FINUFFT_USE_DUCC0) target_compile_definitions(${target} PRIVATE FINUFFT_USE_DUCC0) endif() - target_link_libraries(${target} PRIVATE finufft ${FINUFFT_FFTLIBS}) + target_link_libraries(${target} PRIVATE finufft xsimd ${FINUFFT_FFTLIBS}) if(FINUFFT_USE_OPENMP) target_link_libraries(${target} PRIVATE OpenMP::OpenMP_CXX) target_link_options(${target} PRIVATE ${OpenMP_CXX_FLAGS}) diff --git a/perftest/CMakeLists.txt b/perftest/CMakeLists.txt index 29019b0ff..f0215a6df 100644 --- a/perftest/CMakeLists.txt +++ b/perftest/CMakeLists.txt @@ -6,7 +6,6 @@ foreach(TEST ${PERFTESTS}) if(FINUFFT_USE_DUCC0) target_compile_definitions(${TEST} PRIVATE -DFINUFFT_USE_DUCC0) endif() - target_link_libraries(${TEST} PRIVATE xsimd) finufft_link_test(${TEST}) add_executable(${TEST}f ${TEST}.cpp) @@ -14,7 +13,6 @@ foreach(TEST ${PERFTESTS}) if(FINUFFT_USE_DUCC0) target_compile_definitions(${TEST}f PRIVATE -DFINUFFT_USE_DUCC0) endif() - target_link_libraries(${TEST}f PRIVATE xsimd) finufft_link_test(${TEST}f) endforeach() diff --git a/src/spreadinterp.cpp b/src/spreadinterp.cpp index 97c0126a4..9858f5cc9 100644 --- a/src/spreadinterp.cpp +++ b/src/spreadinterp.cpp @@ -219,8 +219,7 @@ void print_subgrid_info(int ndims, BIGINT offset1, BIGINT offset2, BIGINT offset */ template static FINUFFT_ALWAYS_INLINE T fold_rescale(const T x, const UBIGINT N) noexcept { - static constexpr const T x2pi = T(INV_2PI); - const T result = x * x2pi + T(0.5); + const T result = x * T(INV_2PI) + T(0.5); return (result - floor(result)) * T(N); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 654dac5e4..3736c2caa 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -13,13 +13,11 @@ set(TESTS foreach(TEST ${TESTS}) add_executable(${TEST} ${TEST}.cpp) target_compile_features(${TEST} PRIVATE cxx_std_17) - target_link_libraries(${TEST} PRIVATE xsimd) finufft_link_test(${TEST}) add_executable(${TEST}f ${TEST}.cpp) target_compile_definitions(${TEST}f PRIVATE -DSINGLE) target_compile_features(${TEST}f PRIVATE cxx_std_17) - target_link_libraries(${TEST}f PRIVATE xsimd) finufft_link_test(${TEST}f) endforeach() @@ -32,7 +30,6 @@ add_executable(testutils testutils.cpp) if(FINUFFT_USE_DUCC0) target_compile_definitions(testutils PRIVATE -DFINUFFT_USE_DUCC0) endif() -target_link_libraries(testutils PRIVATE xsimd) target_compile_features(testutils PRIVATE cxx_std_17) finufft_link_test(testutils) add_test( @@ -43,7 +40,6 @@ add_test( if(NOT FINUFFT_USE_DUCC0) add_executable(fftw_lock_test fftw_lock_test.cpp) target_compile_features(fftw_lock_test PRIVATE cxx_std_17) - target_link_libraries(fftw_lock_test PRIVATE xsimd) finufft_link_test(fftw_lock_test) add_test( From cd3ca5aaadaa343e6e51845648aa3efae3215ab0 Mon Sep 17 00:00:00 2001 From: Martin Reinecke Date: Tue, 5 Nov 2024 09:05:32 +0100 Subject: [PATCH 11/11] address review comments --- makefile | 7 +++++-- src/finufft_core.cpp | 6 +++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/makefile b/makefile index c11dc6361..c73c9800d 100644 --- a/makefile +++ b/makefile @@ -175,10 +175,10 @@ usage: @echo "Also see docs/install.rst and docs/README" # collect headers for implicit depends (we don't separate public from private) -HEADERS = $(wildcard include/*.h include/finufft/*.h) $(DUCC_HEADERS) +HEADERS = $(wildcard include/*.h include/finufft/*.h) # implicit rules for objects (note -o ensures writes to correct dir) -%.o: %.cpp $(HEADERS) $(XSIMD_DIR)/include/xsimd/xsimd.hpp +%.o: %.cpp $(HEADERS) $(CXX) -c $(CXXFLAGS) $< -o $@ %.o: %.c $(HEADERS) $(CC) -c $(CFLAGS) $< -o $@ @@ -194,6 +194,9 @@ include/finufft/fft.h: $(DUCC_SETUP) SHEAD = $(wildcard src/*.h) $(XSIMD_DIR)/include/xsimd/xsimd.hpp src/spreadinterp.o: $(SHEAD) +# we need xsimd functionality in finufft_core.h, which is included by many other +# files, so make sure we install xsimd before we prcess any of those files. +include/finufft/finufft_core.h: $(XSIMD_DIR)/include/xsimd/xsimd.hpp # lib ----------------------------------------------------------------------- # build library with double/single prec both bundled in... diff --git a/src/finufft_core.cpp b/src/finufft_core.cpp index bda9f3194..d629c2707 100644 --- a/src/finufft_core.cpp +++ b/src/finufft_core.cpp @@ -548,7 +548,7 @@ FINUFFT_PLAN_T::FINUFFT_PLAN_T(int type_, int dim_, const BIGINT *n_modes, i if (!opts_) // use default opts finufft_default_opts_t(&opts); else // or read from what's passed in - opts = *opts_; // keep a deep copy; changing *opts now has no effect + opts = *opts_; // keep a deep copy; changing *opts_ now has no effect if (opts.debug) // do a hello world printf("[%s] new plan: FINUFFT version " FINUFFT_VER " .................\n", @@ -907,9 +907,9 @@ int FINUFFT_PLAN_T::setpts(BIGINT nj, TF *xj, TF *yj, TF *zj, BIGINT nk, TF } } else for (BIGINT j = 0; j < nj; ++j) - prephase[j] = {1, 0}; // *** or keep flag so no mult in exec?? + prephase[j] = {1.0, 0.0}; // *** or keep flag so no mult in exec?? - // rescale the target s_k etc to s'_k etc... + // rescale the target s_k etc to s'_k etc... #pragma omp parallel for num_threads(opts.nthreads) schedule(static) for (BIGINT k = 0; k < nk; ++k) { Sp[k] = t3P.h1 * t3P.gam1 * (s[k] - t3P.D1); // so |s'_k| < pi/R