flatironinstitute · ahbarnett · Nov 26, 2024 · Dec 6, 2024 · Dec 6, 2024 · Dec 6, 2024
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,11 +1,14 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-Master, using release name V 2.4.0 (1/7/25)
+Master, using release name V 2.4.0 (2/8/25)
+
+* CPU opts.spreadinterponly (experts only) added; CPU and GPU logic and docs
+  changed so upsampfac controls kernel shape properly. Add C++/MATLAB demos. #602 (Barnett)
 * PR617: Caching pip dependencies in github actions.
   Forcing Ninja when building python on Windows.
 * PR614: Added support for sccache in github actions.
-  Caching cmake dependencies so to avoid downloading fftw, xsimd, etc. every time.
+  Caching cmake dependencies so as to avoid downloading fftw, xsimd, etc. every time.
 * fully removed chkbnds option (opts and spreadopts) (Barnett)
 * classic GNU makefile settings make.inc.* tidied to make-platforms/ (Barnett)
 * unified separate-dim arrays (eg X,Y,Z->XYZ), simplifiying core (Reinecke #592)

diff --git a/docs/c_gpu.rst b/docs/c_gpu.rst
@@ -315,9 +315,8 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove
 
 **gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]
 
-**gpu_spreadinterponly**: If ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since no upsampling takes place), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI (see `MRI-NUFFT <https://mind-inria.github.io/mri-nufft/nufft.html>`_), although it might also be useful in spectral Ewald. Please note that this flag is also internally used by type 3 transforms (although it was originally a debug flag).
-
-
+**gpu_spreadinterponly**: [Only has effect for type 1 or 2.] For experts only! If ``0`` do the NUFFT as intended. If ``1``, do *only* spreading (if ``type=1``) or *only* interpolation (if ``type=2``), using kernel shape parameters set by ``tol`` and ``upsampfac``; the result is not upsampled and is not a NUFFT.
+It is analogous to the CPU option named :ref:`spreadinterponly<sionly>` (please read that documentation). [This flag is also internally used for GPU type 3 transforms, although it was originally a debug flag.]
 
 
 Algorithm performance options

diff --git a/docs/conf.py b/docs/conf.py
@@ -56,7 +56,7 @@
 # The suffix(es) of source filenames.
 # You can specify multiple suffix as a list of string:
 # source_suffix = ['.rst', '.md']
-source_suffix = '.rst'
+source_suffix = {'.rst': 'restructuredtext'}
 
 # The encoding of source files.
 #source_encoding = 'utf-8-sig'

diff --git a/docs/error.rst b/docs/error.rst
@@ -29,7 +29,6 @@ has the following meanings (see codes in ``include/finufft_errors.h``):
   18 size of bins for subprob/blockgather invalid
   19 GPU shmem too small for subprob/blockgather parameters
   20 invalid number of nonuniform points: nj or nk negative, or too big (see finufft_core.h)
-  23 invalid upsampfac set while using gpu_spreadinterponly mode
 
 When ``ier=1`` (warning only) the transform(s) is/are still completed, at the smallest epsilon achievable, so, with that caveat, the answer should still be usable.
 

diff --git a/docs/matlabhelp.doc b/docs/matlabhelp.doc
@@ -31,6 +31,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
    Outputs:
      f     size-ms complex column vector of Fourier coefficients, or, if
            ntrans>1, a matrix of size (ms,ntrans).
@@ -77,6 +78,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
   Outputs:
      c     complex column vector of nj answers at targets, or,
            if ntrans>1, matrix of size (nj,ntrans).
@@ -173,6 +175,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
    Outputs:
      f     size (ms,mt) complex matrix of Fourier coefficients
            (ordering given by opts.modeord in each dimension; ms fast, mt slow),
@@ -222,6 +225,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
   Outputs:
      c     complex column vector of nj answers at targets, or,
            if ntrans>1, matrix of size (nj,ntrans).
@@ -321,6 +325,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
    Outputs:
      f     size (ms,mt,mu) complex array of Fourier coefficients
            (ordering given by opts.modeord in each dimension; ms fastest, mu
@@ -372,6 +377,7 @@
      opts.maxbatchsize:  for ntrans>1 only. max blocking size, or 0 for auto.
      opts.nthreads:   number of threads, or 0: use all available (default)
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
   Outputs:
      c     complex column vector of nj answers at targets, or,
            if ntrans>1, matrix of size (nj,ntrans).
@@ -493,6 +499,7 @@
      opts.floatprec: library precision to use, 'double' (default) or 'single'.
      for type 1 and 2 only, the following opts fields are also relevant:
      opts.modeord: 0 (CMCL increasing mode ordering, default), 1 (FFT ordering)
+     opts.spreadinterponly: 0 (perform NUFFT, default), 1 (only spread/interp)
  Outputs:
      plan            finufft_plan object (opaque pointer)
 

diff --git a/docs/opts.rst b/docs/opts.rst
@@ -92,6 +92,33 @@ Data handling options
 
   .. note:: The index *sets* are the same in the two ``modeord`` choices; their ordering differs only by a cyclic shift. The FFT ordering cyclically shifts the CMCL indices $\mbox{floor}(N/2)$ to the left (often called an "fftshift").
 
+.. _sionly:
+
+**spreadinterponly**: [only has effect for type 1 or 2.] For experts only!
+If ``0`` do
+the NUFFT as intended.  If ``1``, omit the FFT and deconvolution
+(diagonal division by kernel Fourier transform) steps, thus returning
+*garbage answers as a NUFFT*, but allowing experts to perform solely
+spreading (if type 1) or solely interpolation (if type 2) by hijacking
+the usual FINUFFT API.  The spreading is onto the grid of the
+user-given size (``N1`` in x, ``N2`` in y, etc), with grid points
+located at coordinates $\{-\pi, -\pi+h, \dots, \pi-h\}$ in each
+dimension, where $h = 2\pi/N$ is the spacing for that dimension ($N$
+here meaning ``N1``, etc). Interpolation is from that same grid.  The
+kernel (width and shape parameter) is determined by ``tol`` and
+``opts.upsampfac``, just as it would be in an actual NUFFT. Note that
+the upsampling factor here only controls the kernel; the grid size
+never differs from ``N1``, etc.  The kernel is not directly
+accessible, leaving the user to figure out how to make use of this
+interface to extract the actual kernel function.  This provides a
+convenient (if hacky) interface to our ``spreadinterp`` module
+(including looping over multiple vectors, if ``ntransf>1``).  The
+known use-case here is estimating so-called density compensation,
+conventionally used in MRI (see `MRI-NUFFT
+<https://mind-inria.github.io/mri-nufft/nufft.html>`_), although it
+might also be useful in spectral Ewald.
+
+
 
 Diagnostic options
 ~~~~~~~~~~~~~~~~~~~~~~~
@@ -100,7 +127,7 @@ Diagnostic options
 
 * ``debug=0`` : silent
 
-* ``debug=1`` : print some information
+* ``debug=1`` : prints some information
 
 * ``debug=2`` : prints more information
 

diff --git a/examples/spreadinterponly1d.cpp b/examples/spreadinterponly1d.cpp
@@ -0,0 +1,91 @@
+// this is all you must include for the finufft lib...
+#include <finufft.h>
+
+// also used in this example...
+#include <cassert>
+#include <chrono>
+#include <complex>
+#include <cstdio>
+#include <stdlib.h>
+#include <vector>
+using namespace std;
+using namespace std::chrono;
+
+int main(int argc, char *argv[])
+/* Example of double-prec spread/interp only tasks, with basic math tests.
+   Complex I/O arrays, but recall the kernel is real.  Barnett 1/8/25.
+
+   The math tests are:
+   1) for spread, check sum of spread kernel masses is as expected from sum
+   of strengths (ie testing the zero-frequency component in NUFFT).
+   2) for interp from an all-ones grid, check each interpolated value equals
+   the mass of one kernel (as measured by spreading a single unit source).
+
+   Without knowing the kernel, this is about all that can be done!
+   (Better math tests would be, ironically, to wrap the spreader/interpolator
+   into a NUFFT and test that :) But we already have that in FINUFFT.)
+
+   Compile and run (static library case):
+
+   g++ spreadinterponly1d.cpp -I../include ../lib-static/libfinufft.a \
+     -o spreadinterponly1d -lfftw3 -lfftw3_omp && ./spreadinterponly1d
+
+   See: spreadtestnd for usage of internal (non FINUFFT-API) spread/interp.
+*/
+{
+  int M = 1e7; // number of nonuniform points
+  int N = 1e7; // size of regular grid
+  finufft_opts opts;
+  finufft_default_opts(&opts);
+  opts.spreadinterponly = 1;    // task: the following two control kernel used...
+  double tol            = 1e-9; // tolerance for (real) kernel shape design only
+  opts.upsampfac        = 2.0;  // pretend upsampling factor (really no upsampling)
+  // opts.spread_kerevalmeth = 0;  // would be needed for any nonstd upsampfac
+
+  complex<double> I = complex<double>(0.0, 1.0); // the imaginary unit
+  vector<double> x(M);                           // input
+  vector<complex<double>> c(M);                  // input
+  vector<complex<double>> F(N);                  // output (spread to this array)
+
+  // first spread M=1 single unit-strength at the origin, only to get its total mass...
+  x[0]       = 0.0;
+  c[0]       = 1.0;
+  int unused = 1;
+  int ier    = finufft1d1(1, &x[0], &c[0], unused, tol, N, &F[0], &opts); // warm-up
+  if (ier > 1) return ier;
+  complex<double> kersum = 0.0;
+  for (auto Fk : F) kersum += Fk; // kernel mass
+
+  // Now generate random nonuniform points (x) and complex strengths (c)...
+  for (int j = 0; j < M; ++j) {
+    x[j] = M_PI * (2 * ((double)rand() / RAND_MAX) - 1); // uniform random in [-pi,pi)
+    c[j] =
+        2 * ((double)rand() / RAND_MAX) - 1 + I * (2 * ((double)rand() / RAND_MAX) - 1);
+  }
+
+  opts.debug = 1;
+  auto t0    = steady_clock::now(); // now spread with all M pts... (dir=1)
+  ier        = finufft1d1(M, &x[0], &c[0], unused, tol, N, &F[0], &opts); // do it
+  double t   = (steady_clock::now() - t0) / 1.0s;
+  if (ier > 1) return ier;
+  complex<double> csum = 0.0; // tot input strength
+  for (auto cj : c) csum += cj;
+  complex<double> mass = 0.0; // tot output mass
+  for (auto Fk : F) mass += Fk;
+  double relerr = abs(mass - kersum * csum) / abs(mass);
+  printf("1D spread-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, mass err %.3g\n",
+         t, M / t, ier, relerr);
+
+  for (auto &Fk : F) Fk = complex<double>{1.0, 0.0}; // unit grid input
+  opts.debug = 0;
+  t0         = steady_clock::now(); // now interp to all M pts...  (dir=2)
+  ier        = finufft1d2(M, &x[0], &c[0], unused, tol, N, &F[0], &opts); // do it
+  t          = (steady_clock::now() - t0) / 1.0s;
+  if (ier > 1) return ier;
+  // each interpolated output should equal the single-kernel mass kersum...
+  double maxerr = 0.0;
+  for (auto cj : c) maxerr = max(maxerr, abs(cj - kersum));
+  printf("1D interp-only, double-prec, %.3g s (%.3g NU pt/sec), ier=%d, max err %.3g\n",
+         t, M / t, ier, maxerr / abs(kersum));
+  return 0;
+}
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
@@ -155,13 +155,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       printf("[cufinufft] upsampfac automatically set to %.3g\n", d_plan->opts.upsampfac);
     }
   }
-  if (d_plan->opts.gpu_spreadinterponly) {
-    // XNOR implementation below with boolean logic.
-    if ((d_plan->opts.upsampfac != 1) == (type != 3)) {
-      ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
-      goto finalize;
-    }
-  }
   /* Setup Spreader */
   if ((ier = setup_spreader_for_nufft(d_plan->spopts, tol, d_plan->opts)) > 1) {
     // can return FINUFFT_WARN_EPS_TOO_SMALL=1, which is OK
@@ -197,7 +190,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
   }
 
-
   // dynamically request the maximum amount of shared memory available
   // for the spreader
 
@@ -235,23 +227,31 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
 
   if (type == 1 || type == 2) {
     CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
-    set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
-                  d_plan->opts.gpu_obinsizex);
-    if (dim > 1)
-      set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2,
-                    d_plan->opts.gpu_obinsizey);
-    if (dim > 2)
-      set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
-                    d_plan->opts.gpu_obinsizez);
-
+    if (d_plan->opts.gpu_spreadinterponly) {
+      // spread/interp grid is precisely the user "mode" sizes, no upsampling
+      nf1 = d_plan->ms;
+      if (dim > 1) nf2 = d_plan->mt;
+      if (dim > 2) nf3 = d_plan->mu;
+      if (d_plan->opts.debug) {
+        printf("[cufinufft] spreadinterponly mode: (nf1,nf2,nf3) = (%d, %d, %d)\n", nf1,
+               nf2, nf3);
+      }
+    } else { // usual NUFFT with fine grid using upsampling
+      set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
+                    d_plan->opts.gpu_obinsizex);
+      if (dim > 1)
+        set_nf_type12(d_plan->mt, d_plan->opts, d_plan->spopts, &nf2,
+                      d_plan->opts.gpu_obinsizey);
+      if (dim > 2)
+        set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
+                      d_plan->opts.gpu_obinsizez);
+      if (d_plan->opts.debug)
+        printf("[cufinufft] (nf1,nf2,nf3) = (%d, %d, %d)\n", nf1, nf2, nf3);
+    }
     d_plan->nf1 = nf1;
     d_plan->nf2 = nf2;
     d_plan->nf3 = nf3;
     d_plan->nf  = nf1 * nf2 * nf3;
-    if (d_plan->opts.debug) {
-      printf("[cufinufft] (nf1,nf2,nf3) = (%d, %d, %d)\n", d_plan->nf1, d_plan->nf2,
-             d_plan->nf3);
-    }
 
     using namespace cufinufft::memtransfer;
     switch (d_plan->dim) {

diff --git a/include/cufinufft/spreadinterp.h b/include/cufinufft/spreadinterp.h
@@ -72,7 +72,8 @@ static inline T evaluate_kernel(T x, const finufft_spread_opts &opts)
 }
 
 template<typename T>
-int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth);
+int setup_spreader(finufft_spread_opts &opts, T eps, T upsampfac, int kerevalmeth,
+                   int debug, int spreadinterponly);
 
 template<typename T>
 static __forceinline__ __device__ T evaluate_kernel(T x, T es_c, T es_beta, int ns)

diff --git a/include/finufft.fh b/include/finufft.fh
@@ -7,7 +7,7 @@ c     erase chkbnds 1/7/25.
       type finufft_opts
 
 c     data handling opts...
-      integer modeord
+      integer modeord, spreadinterponly
 
 c     diagnostic opts...
       integer debug, spread_debug, showwarn

diff --git a/include/finufft/spreadinterp.h b/include/finufft/spreadinterp.h
@@ -52,11 +52,12 @@ FINUFFT_EXPORT int FINUFFT_CDECL spreadinterpSorted(
 template<typename T>
 FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel(T x, const finufft_spread_opts &opts);
 template<typename T>
-FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel_horner(T x, const finufft_spread_opts &opts);
+FINUFFT_EXPORT T FINUFFT_CDECL evaluate_kernel_horner(T x,
+                                                      const finufft_spread_opts &opts);
 template<typename T>
-FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(finufft_spread_opts &opts, T eps,
-                                                double upsampfac, int kerevalmeth,
-                                                int debug, int showwarn, int dim);
+FINUFFT_EXPORT int FINUFFT_CDECL setup_spreader(
+    finufft_spread_opts &opts, T eps, double upsampfac, int kerevalmeth, int debug,
+    int showwarn, int dim, int spreadinterponly);
 
 } // namespace spreadinterp
 } // namespace finufft

diff --git a/include/finufft/test_defs.h b/include/finufft/test_defs.h
@@ -14,8 +14,9 @@
 // the public interface
 #include <finufft.h>
 
-// convenient private finufft internals
+// convenient private finufft internals that tests need
 #include <finufft/finufft_core.h>
+#include <finufft/finufft_utils.hpp>
 #include <memory>
 
 // --------------- Private data types for compilation in either prec ---------

diff --git a/include/finufft_errors.h b/include/finufft_errors.h
@@ -26,6 +26,5 @@ enum {
   FINUFFT_ERR_NUM_NU_PTS_INVALID     = 20,
   FINUFFT_ERR_INVALID_ARGUMENT       = 21,
   FINUFFT_ERR_LOCK_FUNS_INVALID      = 22,
-  FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID  = 23,
 };
 #endif
diff --git a/include/finufft_mod.f90 b/include/finufft_mod.f90
@@ -8,7 +8,7 @@ module finufft_mod
 type finufft_opts
 
    ! data handling opts...
-   integer(kind=C_INT) :: modeord
+   integer(kind=C_INT) :: modeord, spreadinterponly
 
    ! diagnostic opts...
    integer(kind=C_INT) :: debug, spread_debug, showwarn

diff --git a/include/finufft_opts.h b/include/finufft_opts.h
@@ -9,8 +9,10 @@ typedef struct finufft_opts { // defaults see finufft_core.cpp:finufft_default_o
   // sphinx tag (don't remove): @opts_start
   // FINUFFT options:
   // data handling opts...
-  int modeord; // (type 1,2 only): 0 CMCL-style increasing mode order
-               //                  1 FFT-style mode order
+  int modeord;          // (type 1,2 only): 0 CMCL-style increasing mode order
+                        //                  1 FFT-style mode order
+  int spreadinterponly; // 0 do actual NUFFT
+                        // 1 only spread (if type 1) or interpolate (type 2)
 
   // diagnostic opts...
   int debug;        // 0 silent, 1 some timing/debug, or 2 more