Automatically set method for GPU type 3 #576

Merged 8 commits on Oct 17, 2024
Changes from all commits
76 changes: 40 additions & 36 deletions include/cufinufft/impl.h
@@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
using namespace cufinufft::common;
int ier;
if (type < 1 || type > 3) {
fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type);
return FINUFFT_ERR_TYPE_NOTVALID;
}
if (ntransf < 1) {
@@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}

cufinufft_setup_binsize<T>(type, d_plan->spopts.nspread, dim, &d_plan->opts);
if (ier = cudaGetLastError(), ier != cudaSuccess) {
if (cudaGetLastError() != cudaSuccess) {
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
if (d_plan->opts.debug) {
@@ -196,6 +197,42 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
}


// dynamically request the maximum amount of shared memory available
// for the spreader

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster
* if there is enough shared memory available. Otherwise, we default to GM.
* Type 3 inherits this behavior since the outer plan here is also a type 1.
*
* For type 2, we always default to method 1 (GM).
*/
if (type == 2) {
d_plan->opts.gpu_method = 1;
} else {
// query the device for the amount of shared memory available
int shared_mem_per_block{};
cudaDeviceGetAttribute(&shared_mem_per_block,
cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
// compute the amount of shared memory required for the method
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
if ((shared_mem_required > shared_mem_per_block)) {
d_plan->opts.gpu_method = 1;
} else {
d_plan->opts.gpu_method = 2;
}
}
}

if (cudaGetLastError() != cudaSuccess) {
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}

if (type == 1 || type == 2) {
CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
@@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
d_plan->opts.gpu_obinsizez);

// dynamically request the maximum amount of shared memory available
// for the spreader

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster
* if there is enough shared memory available. Otherwise, we default to GM.
*
* For type 2, we always default to method 1 (GM).
*/
if (type == 2) {
d_plan->opts.gpu_method = 1;
} else {
// query the device for the amount of shared memory available
int shared_mem_per_block{};
cudaDeviceGetAttribute(&shared_mem_per_block,
cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
// compute the amount of shared memory required for the method
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
if ((shared_mem_required > shared_mem_per_block)) {
d_plan->opts.gpu_method = 1;
} else {
d_plan->opts.gpu_method = 2;
}
}
}

if ((ier = cudaGetLastError())) {
goto finalize;
}

d_plan->nf1 = nf1;
d_plan->nf2 = nf2;
d_plan->nf3 = nf3;
@@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
cufinufft_opts t2opts = d_plan->opts;
t2opts.gpu_spreadinterponly = 0;
t2opts.gpu_method = 1;
t2opts.gpu_method = 0;
// Safe to ignore the return value here?
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
// check that maxbatchsize is correct
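For context, a minimal sketch of how a caller picks up this change (not part of the diff): leaving gpu_method at 0 lets the plan choose SM (method 2) or GM (method 1) itself, and after this PR that also covers type 3, whose outer plan is built like a type 1 and whose inner type 2 plan in setpts now passes gpu_method = 0 instead of hard-coding GM. The sketch assumes the public double-precision plan interface (cufinufft_default_opts and cufinufft_makeplan taking an int64_t mode array) as documented for recent cufinufft releases; the helper name and the sizes are illustrative only.

    // Sketch: rely on automatic GPU method selection for a type 3 plan.
    #include <cufinufft.h>
    #include <stdint.h>

    int make_type3_plan_auto(int dim, int ntransf, double tol, cufinufft_plan *plan) {
      cufinufft_opts opts;
      cufinufft_default_opts(&opts);
      // 0 = auto: SM (method 2) when the spreader's bins fit in shared memory,
      // otherwise GM (method 1); type 2 plans always default to GM.
      opts.gpu_method = 0;
      int64_t nmodes[3] = {1, 1, 1}; // mode sizes are not used by type 3
      return cufinufft_makeplan(3, dim, nmodes, /*iflag=*/+1, ntransf, tol, plan, &opts);
    }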
37 changes: 37 additions & 0 deletions test/cuda/CMakeLists.txt
@@ -21,6 +21,10 @@ foreach(srcfile ${test_src})
endforeach()

function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})
@@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
@@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
@@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL}
${CHECK_TOL} ${PREC} ${UPSAMP})

add_test(NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
@@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
${PREC} ${UPSAMP})
endif()

add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100
${PREC} ${UPSAMP})
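The new _auto_ cases mirror the existing GM/SM tests but pass method 0 as the first argument, so the automatic selection path is now exercised for types 1, 2, and 3 in 1D, 2D, and 3D. Assuming the usual CTest workflow, they can be run on their own from the build directory with, for example, ctest -R _test_auto_.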