Skip to content

Commit

Permalink
Merge pull request #576 from janden/auto_method_type3
Browse files (browse the repository at this point in the history)
Automatically set method for GPU type 3
  • Loading branch information
ahbarnett authored Oct 17, 2024
2 parents 4aae46f + f7af341 commit efae920
Show file tree
Hide file tree
Showing 2 changed files with 77 additions and 36 deletions.
76 changes: 40 additions & 36 deletions include/cufinufft/impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
using namespace cufinufft::common;
int ier;
if (type < 1 || type > 3) {
fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type);
return FINUFFT_ERR_TYPE_NOTVALID;
}
if (ntransf < 1) {
Expand Down Expand Up @@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
}

cufinufft_setup_binsize<T>(type, d_plan->spopts.nspread, dim, &d_plan->opts);
if (ier = cudaGetLastError(), ier != cudaSuccess) {
if (cudaGetLastError() != cudaSuccess) {
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}
if (d_plan->opts.debug) {
Expand All @@ -196,6 +197,42 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
}


// dynamically request the maximum amount of shared memory available
// for the spreader

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster
* if there is enough shared memory available. Otherwise, we default to GM.
* Type 3 inherits this behavior since the outer plan here is also a type 1.
*
* For type 2, we always default to method 1 (GM).
*/
if (type == 2) {
d_plan->opts.gpu_method = 1;
} else {
// query the device for the amount of shared memory available
int shared_mem_per_block{};
cudaDeviceGetAttribute(&shared_mem_per_block,
cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
// compute the amount of shared memory required for the method
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
if ((shared_mem_required > shared_mem_per_block)) {
d_plan->opts.gpu_method = 1;
} else {
d_plan->opts.gpu_method = 2;
}
}
}

if (cudaGetLastError() != cudaSuccess) {
ier = FINUFFT_ERR_CUDA_FAILURE;
goto finalize;
}

if (type == 1 || type == 2) {
CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
Expand All @@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
d_plan->opts.gpu_obinsizez);

// dynamically request the maximum amount of shared memory available
// for the spreader

/* Automatically set GPU method. */
if (d_plan->opts.gpu_method == 0) {
/* For type 1, we default to method 2 (SM) since this is generally faster
* if there is enough shared memory available. Otherwise, we default to GM.
*
* For type 2, we always default to method 1 (GM).
*/
if (type == 2) {
d_plan->opts.gpu_method = 1;
} else {
// query the device for the amount of shared memory available
int shared_mem_per_block{};
cudaDeviceGetAttribute(&shared_mem_per_block,
cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
// compute the amount of shared memory required for the method
const auto shared_mem_required = shared_memory_required<T>(
dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
if ((shared_mem_required > shared_mem_per_block)) {
d_plan->opts.gpu_method = 1;
} else {
d_plan->opts.gpu_method = 2;
}
}
}

if ((ier = cudaGetLastError())) {
goto finalize;
}

d_plan->nf1 = nf1;
d_plan->nf2 = nf2;
d_plan->nf3 = nf3;
Expand Down Expand Up @@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
cufinufft_opts t2opts = d_plan->opts;
t2opts.gpu_spreadinterponly = 0;
t2opts.gpu_method = 1;
t2opts.gpu_method = 0;
// Safe to ignore the return value here?
if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
// check that maxbatchsize is correct
Expand Down
37 changes: 37 additions & 0 deletions test/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@ foreach(srcfile ${test_src})
endforeach()

function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})
Expand All @@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
${UPSAMP})

add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
Expand All @@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP}
COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
Expand Down Expand Up @@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL}
${CHECK_TOL} ${PREC} ${UPSAMP})

add_test(NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})
Expand All @@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
${PREC} ${UPSAMP})
endif()

add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL}
${PREC} ${UPSAMP})

add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP}
COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100
${PREC} ${UPSAMP})
Expand Down

0 comments on commit efae920

Please sign in to comment.