diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
index 012050b15..543634f15 100644
--- a/include/cufinufft/impl.h
+++ b/include/cufinufft/impl.h
@@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   using namespace cufinufft::common;
   int ier;
   if (type < 1 || type > 3) {
-    fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
+    fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
   }
   if (ntransf < 1) {
@@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
   cufinufft_setup_binsize(type, d_plan->spopts.nspread, dim, &d_plan->opts);
-  if (ier = cudaGetLastError(), ier != cudaSuccess) {
+  if (cudaGetLastError() != cudaSuccess) {
+    ier = FINUFFT_ERR_CUDA_FAILURE;
     goto finalize;
   }
   if (d_plan->opts.debug) {
@@ -196,6 +197,42 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     printf("[cufinufft] shared memory required for the spreader: %ld\n",
            mem_required);
   }
+
+  // dynamically request the maximum amount of shared memory available
+  // for the spreader
+
+  /* Automatically set GPU method. */
+  if (d_plan->opts.gpu_method == 0) {
+    /* For type 1, we default to method 2 (SM) since this is generally faster
+     * if there is enough shared memory available. Otherwise, we default to GM.
+     * Type 3 inherits this behavior since the outer plan here is also a type 1.
+     *
+     * For type 2, we always default to method 1 (GM).
+     */
+    if (type == 2) {
+      d_plan->opts.gpu_method = 1;
+    } else {
+      // query the device for the amount of shared memory available
+      int shared_mem_per_block{};
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      // compute the amount of shared memory required for the method
+      const auto shared_mem_required = shared_memory_required(
+          dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+          d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
+      if ((shared_mem_required > shared_mem_per_block)) {
+        d_plan->opts.gpu_method = 1;
+      } else {
+        d_plan->opts.gpu_method = 2;
+      }
+    }
+  }
+
+  if (cudaGetLastError() != cudaSuccess) {
+    ier = FINUFFT_ERR_CUDA_FAILURE;
+    goto finalize;
+  }
+
   if (type == 1 || type == 2) {
     CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
     set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
@@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
                   d_plan->opts.gpu_obinsizez);
 
-    // dynamically request the maximum amount of shared memory available
-    // for the spreader
-
-    /* Automatically set GPU method. */
-    if (d_plan->opts.gpu_method == 0) {
-      /* For type 1, we default to method 2 (SM) since this is generally faster
-       * if there is enough shared memory available. Otherwise, we default to GM.
-       *
-       * For type 2, we always default to method 1 (GM).
-       */
-      if (type == 2) {
-        d_plan->opts.gpu_method = 1;
-      } else {
-        // query the device for the amount of shared memory available
-        int shared_mem_per_block{};
-        cudaDeviceGetAttribute(&shared_mem_per_block,
-                               cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
-        // compute the amount of shared memory required for the method
-        const auto shared_mem_required = shared_memory_required(
-            dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
-            d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
-        if ((shared_mem_required > shared_mem_per_block)) {
-          d_plan->opts.gpu_method = 1;
-        } else {
-          d_plan->opts.gpu_method = 2;
-        }
-      }
-    }
-
-    if ((ier = cudaGetLastError())) {
-      goto finalize;
-    }
-
     d_plan->nf1 = nf1;
     d_plan->nf2 = nf2;
     d_plan->nf3 = nf3;
@@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
     int t2modes[] = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
     cufinufft_opts t2opts = d_plan->opts;
     t2opts.gpu_spreadinterponly = 0;
-    t2opts.gpu_method = 1;
+    t2opts.gpu_method = 0;
     // Safe to ignore the return value here?
     if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
     // check that maxbatchsize is correct
diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
index 1cadb7569..d8af7918a 100644
--- a/test/cuda/CMakeLists.txt
+++ b/test/cuda/CMakeLists.txt
@@ -21,6 +21,10 @@ foreach(srcfile ${test_src})
 endforeach()
 
 function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
+  add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
@@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
 
+  add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
+
+  add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
 
+  add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0
                    2e4 ${REQ_TOL} ${CHECK_TOL} ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
                    ${PREC} ${UPSAMP})
   endif()
 
+  add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100
                    ${PREC} ${UPSAMP})
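
Note (not part of the patch): the block relocated into cufinufft_makeplan_impl above auto-selects the spreader method when opts.gpu_method == 0 by comparing the shared memory a padded bin would need against what the device allows a block to opt in to. The standalone sketch below mirrors that decision; the (bin + nspread) padding estimate is an illustrative stand-in for cufinufft's internal shared_memory_required(), not the library's exact computation.

    // pick_gpu_method.cu -- illustrative sketch only; mirrors the decision logic above.
    #include <cuda_runtime.h>
    #include <cstdio>

    // Returns 1 (GM) or 2 (SM), mimicking the auto-selection done for gpu_method == 0.
    static int pick_gpu_method(int type, int dim, int nspread, int binx, int biny, int binz) {
      if (type == 2) return 1; // type 2 always defaults to GM (method 1)

      int device = 0;
      cudaGetDevice(&device);

      // Largest dynamic shared memory a block can opt in to on this device.
      int shared_mem_per_block = 0;
      cudaDeviceGetAttribute(&shared_mem_per_block,
                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

      // Stand-in estimate: one padded bin of complex single-precision values.
      // (cufinufft's shared_memory_required() may compute this differently.)
      long long required = 2LL * sizeof(float) * (binx + nspread);
      if (dim > 1) required *= (biny + nspread);
      if (dim > 2) required *= (binz + nspread);

      // Use SM (method 2) only if the padded bin fits in shared memory.
      return (required > shared_mem_per_block) ? 1 : 2;
    }

    int main() {
      // Example: a 3D type-1 spread with nspread = 8 and 16^3 bins.
      std::printf("auto-selected gpu_method = %d\n", pick_gpu_method(1, 3, 8, 16, 16, 16));
      return 0;
    }

The new *_test_auto_* CTest cases added above exercise exactly this path by passing method 0 to the existing test drivers, so both outcomes of the heuristic remain covered alongside the explicit GM and SM runs.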