Merge pull request #576 from janden/auto_method_type3

Automatically set method for GPU type 3
flatironinstitute · Oct 17, 2024 · efae920 · efae920
2 parents 4aae46f + f7af341
commit efae920
Show file tree

Hide file tree

Showing 2 changed files with 77 additions and 36 deletions.
diff --git a/include/cufinufft/impl.h b/include/cufinufft/impl.h
@@ -74,7 +74,7 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   using namespace cufinufft::common;
   int ier;
   if (type < 1 || type > 3) {
-    fprintf(stderr, "[%s] Invalid type (%d): should be 1 or 2.\n", __func__, type);
+    fprintf(stderr, "[%s] Invalid type (%d): should be 1, 2, or 3.\n", __func__, type);
     return FINUFFT_ERR_TYPE_NOTVALID;
   }
   if (ntransf < 1) {
@@ -178,7 +178,8 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
   }
 
   cufinufft_setup_binsize<T>(type, d_plan->spopts.nspread, dim, &d_plan->opts);
-  if (ier = cudaGetLastError(), ier != cudaSuccess) {
+  if (cudaGetLastError() != cudaSuccess) {
+    ier = FINUFFT_ERR_CUDA_FAILURE;
     goto finalize;
   }
   if (d_plan->opts.debug) {
@@ -196,6 +197,42 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
     printf("[cufinufft] shared memory required for the spreader: %ld\n", mem_required);
   }
 
+
+  // dynamically request the maximum amount of shared memory available
+  // for the spreader
+
+  /* Automatically set GPU method. */
+  if (d_plan->opts.gpu_method == 0) {
+    /* For type 1, we default to method 2 (SM) since this is generally faster
+     * if there is enough shared memory available. Otherwise, we default to GM.
+     * Type 3 inherits this behavior since the outer plan here is also a type 1.
+     *
+     * For type 2, we always default to method 1 (GM).
+     */
+    if (type == 2) {
+      d_plan->opts.gpu_method = 1;
+    } else {
+      // query the device for the amount of shared memory available
+      int shared_mem_per_block{};
+      cudaDeviceGetAttribute(&shared_mem_per_block,
+                             cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
+      // compute the amount of shared memory required for the method
+      const auto shared_mem_required = shared_memory_required<T>(
+          dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
+          d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
+      if ((shared_mem_required > shared_mem_per_block)) {
+        d_plan->opts.gpu_method = 1;
+      } else {
+        d_plan->opts.gpu_method = 2;
+      }
+    }
+  }
+
+  if (cudaGetLastError() != cudaSuccess) {
+    ier = FINUFFT_ERR_CUDA_FAILURE;
+    goto finalize;
+  }
+
   if (type == 1 || type == 2) {
     CUFINUFFT_BIGINT nf1 = 1, nf2 = 1, nf3 = 1;
     set_nf_type12(d_plan->ms, d_plan->opts, d_plan->spopts, &nf1,
@@ -207,39 +244,6 @@ int cufinufft_makeplan_impl(int type, int dim, int *nmodes, int iflag, int ntran
       set_nf_type12(d_plan->mu, d_plan->opts, d_plan->spopts, &nf3,
                     d_plan->opts.gpu_obinsizez);
 
-    // dynamically request the maximum amount of shared memory available
-    // for the spreader
-
-    /* Automatically set GPU method. */
-    if (d_plan->opts.gpu_method == 0) {
-      /* For type 1, we default to method 2 (SM) since this is generally faster
-       * if there is enough shared memory available. Otherwise, we default to GM.
-       *
-       * For type 2, we always default to method 1 (GM).
-       */
-      if (type == 2) {
-        d_plan->opts.gpu_method = 1;
-      } else {
-        // query the device for the amount of shared memory available
-        int shared_mem_per_block{};
-        cudaDeviceGetAttribute(&shared_mem_per_block,
-                               cudaDevAttrMaxSharedMemoryPerBlockOptin, device_id);
-        // compute the amount of shared memory required for the method
-        const auto shared_mem_required = shared_memory_required<T>(
-            dim, d_plan->spopts.nspread, d_plan->opts.gpu_binsizex,
-            d_plan->opts.gpu_binsizey, d_plan->opts.gpu_binsizez);
-        if ((shared_mem_required > shared_mem_per_block)) {
-          d_plan->opts.gpu_method = 1;
-        } else {
-          d_plan->opts.gpu_method = 2;
-        }
-      }
-    }
-
-    if ((ier = cudaGetLastError())) {
-      goto finalize;
-    }
-
     d_plan->nf1 = nf1;
     d_plan->nf2 = nf2;
     d_plan->nf3 = nf3;
@@ -795,7 +799,7 @@ int cufinufft_setpts_impl(int M, T *d_kx, T *d_ky, T *d_kz, int N, T *d_s, T *d_
     int t2modes[]               = {d_plan->nf1, d_plan->nf2, d_plan->nf3};
     cufinufft_opts t2opts       = d_plan->opts;
     t2opts.gpu_spreadinterponly = 0;
-    t2opts.gpu_method           = 1;
+    t2opts.gpu_method           = 0;
     // Safe to ignore the return value here?
     if (d_plan->t2_plan) cufinufft_destroy_impl(d_plan->t2_plan);
     // check that maxbatchsize is correct

diff --git a/test/cuda/CMakeLists.txt b/test/cuda/CMakeLists.txt
@@ -21,6 +21,10 @@ foreach(srcfile ${test_src})
 endforeach()
 
 function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
+  add_test(NAME cufinufft1d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
@@ -29,13 +33,26 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft1d_test 2 1 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
 
+  add_test(NAME cufinufft1d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d2_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 2 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
+
+  add_test(NAME cufinufft1d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft1d_test 0 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
+                   ${UPSAMP})
+
   add_test(NAME cufinufft1d3_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft1d_test 1 3 1e2 2e2 ${REQ_TOL} ${CHECK_TOL} ${PREC}
                    ${UPSAMP})
 
+  add_test(NAME cufinufft2d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 1 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -44,10 +61,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft2d_test 2 1 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft2d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d2_test_SM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 2 2 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft2d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft2d_test 0 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft2d3_test_SM_${PREC}_${UPSAMP}
            COMMAND cufinufft2d_test 2 3 1e2 2e2 2e4 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -76,6 +101,10 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
            COMMAND cufinufft2dmany_test 2 3 1e2 2e2 5 0 2e4 ${REQ_TOL}
                    ${CHECK_TOL} ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft3d1_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d1_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 1 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
@@ -98,10 +127,18 @@ function(add_tests PREC REQ_TOL CHECK_TOL UPSAMP)
                      ${PREC} ${UPSAMP})
   endif()
 
+  add_test(NAME cufinufft3d2_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d2_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 2 2 5 10 20 ${REQ_TOL} ${CHECK_TOL}
                    ${PREC} ${UPSAMP})
 
+  add_test(NAME cufinufft3d3_test_auto_${PREC}_${UPSAMP}
+           COMMAND cufinufft3d_test 0 3 2 5 10 30 ${REQ_TOL} ${CHECK_TOL}
+                   ${PREC} ${UPSAMP})
+
   add_test(NAME cufinufft3d3_test_GM_${PREC}_${UPSAMP}
            COMMAND cufinufft3d_test 1 3 2 3 7 20 ${REQ_TOL} ${CHECK_TOL}*100
                    ${PREC} ${UPSAMP})