Skip to content

Commit

Permalink
docs: fix URL in c_gpu, and opts explain upsampfac choices better
Browse files Browse the repository at this point in the history
  • Loading branch information
ahbarnett committed Oct 8, 2024
1 parent cf25e43 commit c7d8ac4
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 77 deletions.
2 changes: 1 addition & 1 deletion docs/c_gpu.rst
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,7 @@ while ``modeord=1`` selects FFT-style ordering starting at zero and wrapping ove

**gpu_device_id**: Sets the GPU device ID. Leave at default unless you know what you're doing. [To be documented]

**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use-case here is estimating so-called density compensation, conventionally used in MRI. (See [MRI-NUFFT](https://mind-inria.github.io/mri-nufft/nufft.html)) Please note that this flag is also internally used by type 3 transforms (which was its original use case).
**gpu_spreadinterponly**: if ``0`` do the NUFFT as intended. If ``1``, omit the FFT and deconvolution (diagonal division by kernel Fourier transform) steps, which returns *garbage answers as a NUFFT*, but allows advanced users to perform an isolated spreading or interpolation using the usual type 1 or type 2 ``cufinufft`` interface. To do this, the nonzero flag value must be used *only* with ``upsampfac=1.0`` (since the input and output grids are the same size, and neither represents Fourier coefficients), and ``kerevalmeth=1``. The known use case here is estimating so-called density compensation, conventionally used in MRI (see `MRI-NUFFT <https://mind-inria.github.io/mri-nufft/nufft.html>`_). Please note that this flag is also internally used by type 3 transforms (being its original use case).



Expand Down
7 changes: 4 additions & 3 deletions docs/opts.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,11 @@ Here are their default settings (from ``src/finufft.cpp:finufft_default_opts``):

As for quick advice, the main options you'll want to play with are:

- ``upsampfac`` to trade off between spread/interpolate vs FFT speed and RAM
- ``modeord`` to flip ("fftshift") the Fourier mode ordering
- ``debug`` to look at timing output (to determine if your problem is spread/interpolation dominated, vs FFT dominated)
- ``nthreads`` to run with a different number of threads than the current maximum available through OpenMP (a large number can sometimes be detrimental, and very small problems can sometimes run faster on 1 thread)
- ``fftw`` to try slower plan modes which give faster transforms. The next natural one to try is ``FFTW_MEASURE`` (look at the FFTW3 docs)
- ``fftw`` to try slower FFTW plan modes which give faster transforms. The next natural one to try is ``FFTW_MEASURE`` (look at the FFTW3 docs)

See :ref:`Troubleshooting <trouble>` for good advice on trying options, and read the full options descriptions below.

Expand Down Expand Up @@ -158,9 +159,9 @@ There is thus little reason for the nonexpert to mess with this option.
* ``spread_kerpad=1`` : pad to next multiple of four


**upsampfac**: This is the internal real factor by which the FFT (fine grid)
**upsampfac**: This is the internal factor, greater than 1, by which the FFT (fine grid)
is chosen larger than
the number of requested modes in each dimension, for type 1 and 2 transforms. It must be greater than 1.
the number of requested modes in each dimension, for type 1 and 2 transforms. For type 3 transforms this factor gets squared, due to the type 2 being nested inside a type-1-style spreading operation, so it has an even larger effect.
We have built efficient kernels
for only two settings that are greater than 1, as follows. Otherwise, setting it to zero chooses a good heuristic:

Expand Down
147 changes: 74 additions & 73 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,9 +108,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
d_plan->mt = nmodes[1];
d_plan->mu = nmodes[2];
if (d_plan->opts.debug) {
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
printf("[cufinufft] (ms,mt,mu): %d %d %d\n", d_plan->ms, d_plan->mt, d_plan->mu);
}
} else {
} else { // type 3 turns its outer type 1 into spreading-only
d_plan->opts.gpu_spreadinterponly = 1;
}

Expand Down Expand Up @@ -157,9 +157,9 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}
if (d_plan->opts.gpu_spreadinterponly) {
// XNOR implementation below with boolean logic.
if ((d_plan->opts.upsampfac !=1) == (type != 3)) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
if ((d_plan->opts.upsampfac != 1) == (type != 3)) {
ier = FINUFFT_ERR_SPREADONLY_UPSAMP_INVALID;
goto finalize;
}
}
/* Setup Spreader */
Expand Down Expand Up @@ -262,74 +262,75 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
} break;
}

// We don't need any cuFFT plans or kernel values if we are only spreading / interpolating
if(!d_plan->opts.gpu_spreadinterponly) {
cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}
// We don't need any cuFFT plans or kernel values if we are only spreading /
// interpolating
if (!d_plan->opts.gpu_spreadinterponly) {
cufftHandle fftplan;
cufftResult_t cufft_status;
switch (d_plan->dim) {
case 1: {
int n[] = {(int)nf1};
int inembed[] = {(int)nf1};

cufft_status = cufftPlanMany(&fftplan, 1, n, inembed, 1, inembed[0], inembed, 1,
inembed[0], cufft_type<T>(), batchsize);
} break;
case 2: {
int n[] = {(int)nf2, (int)nf1};
int inembed[] = {(int)nf2, (int)nf1};

cufft_status =
cufftPlanMany(&fftplan, 2, n, inembed, 1, inembed[0] * inembed[1], inembed, 1,
inembed[0] * inembed[1], cufft_type<T>(), batchsize);
} break;
case 3: {
int n[] = {(int)nf3, (int)nf2, (int)nf1};
int inembed[] = {(int)nf3, (int)nf2, (int)nf1};

cufft_status = cufftPlanMany(
&fftplan, 3, n, inembed, 1, inembed[0] * inembed[1] * inembed[2], inembed, 1,
inembed[0] * inembed[1] * inembed[2], cufft_type<T>(), batchsize);
} break;
}

if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD, d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
if (cufft_status != CUFFT_SUCCESS) {
fprintf(stderr, "[%s] cufft makeplan error: %s", __func__,
cufftGetErrorString(cufft_status));
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
cufftSetStream(fftplan, stream);

d_plan->fftplan = fftplan;

// compute up to 3 * NQUAD precomputed values on CPU
T fseries_precomp_phase[3 * MAX_NQUAD];
T fseries_precomp_f[3 * MAX_NQUAD];
thrust::device_vector<T> d_fseries_precomp_phase(3 * MAX_NQUAD);
thrust::device_vector<T> d_fseries_precomp_f(3 * MAX_NQUAD);
onedim_fseries_kernel_precomp<T>(d_plan->nf1, fseries_precomp_f,
fseries_precomp_phase, d_plan->spopts);
if (d_plan->dim > 1)
onedim_fseries_kernel_precomp<T>(d_plan->nf2, fseries_precomp_f + MAX_NQUAD,
fseries_precomp_phase + MAX_NQUAD,
d_plan->spopts);
if (d_plan->dim > 2)
onedim_fseries_kernel_precomp<T>(d_plan->nf3, fseries_precomp_f + 2 * MAX_NQUAD,
fseries_precomp_phase + 2 * MAX_NQUAD,
d_plan->spopts);
// copy the precomputed data to the device using thrust
thrust::copy(fseries_precomp_phase, fseries_precomp_phase + 3 * MAX_NQUAD,
d_fseries_precomp_phase.begin());
thrust::copy(fseries_precomp_f, fseries_precomp_f + 3 * MAX_NQUAD,
d_fseries_precomp_f.begin());
// the full fseries is done on the GPU here
if ((ier = fseries_kernel_compute(
d_plan->dim, d_plan->nf1, d_plan->nf2, d_plan->nf3,
d_fseries_precomp_f.data().get(), d_fseries_precomp_phase.data().get(),
d_plan->fwkerhalf1, d_plan->fwkerhalf2, d_plan->fwkerhalf3,
d_plan->spopts.nspread, stream)))
goto finalize;
}

}
finalize:
if (ier > 1) {
Expand Down Expand Up @@ -760,8 +761,8 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
thrust::cuda::par.on(stream), phase_iterator, phase_iterator + N,
d_plan->deconv, d_plan->deconv,
[c1, c2, c3, d1, d2, d3, realsign] __host__ __device__(
const thrust::tuple<T, T, T> tuple,
cuda_complex<T> deconv) -> cuda_complex<T> {
const thrust::tuple<T, T, T> tuple, cuda_complex<T> deconv)
-> cuda_complex<T> {
// d2 and d3 are 0 if dim < 2 and dim < 3
const auto phase = c1 * (thrust::get<0>(tuple) + d1) +
c2 * (thrust::get<1>(tuple) + d2) +
Expand Down

0 comments on commit c7d8ac4

Please sign in to comment.