Use opt-in shared memory carveout for FIL (rapidsai#3759)
To speed up inference, take advantage of the extra shared memory that newer GPUs allow kernels to opt into, in the cases where the larger carveout makes inference out of shared memory possible.
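
For context, the core CUDA mechanism behind this change is sketched below: query how much shared memory the device lets a kernel opt into, then raise the kernel's dynamic shared memory cap with cudaFuncSetAttribute before launching. This is a minimal standalone illustration, not cuML code; the kernel touch_shmem, the launch configuration, and the sizes are made up for the example.

#include <cstdio>
#include <cuda_runtime.h>

// Toy kernel that fills dynamically-sized shared memory.
__global__ void touch_shmem(int n)
{
  extern __shared__ float smem[];
  for (int i = threadIdx.x; i < n; i += blockDim.x) smem[i] = float(i);
  __syncthreads();
}

int main()
{
  int device = 0, max_shm = 0;
  // The opt-in limit can exceed the default 48 KiB cap on Volta and newer GPUs.
  cudaDeviceGetAttribute(&max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device);
  // Without this opt-in, launches requesting more than 48 KiB of dynamic
  // shared memory fail at launch time.
  cudaFuncSetAttribute(touch_shmem, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm);
  touch_shmem<<<1, 256, max_shm>>>(max_shm / int(sizeof(float)));
  cudaDeviceSynchronize();
  printf("launched with %d bytes of dynamic shared memory: %s\n",
         max_shm, cudaGetErrorString(cudaGetLastError()));
  return 0;
}

The opt-in limit on recent parts (for example, 96 KiB per block on V100 and 163 KiB on A100) is well above the default 48 KiB, which is what makes the carveout worth requesting.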

Authors:
  - Levs Dolgovs (https://github.com/levsnv)

Approvers:
  - Andy Adinets (https://github.com/canonizer)
  - William Hicks (https://github.com/wphicks)

URL: rapidsai#3759
levsnv authored Nov 12, 2021
1 parent 88bd7a6 commit 0d72c5a
Showing 4 changed files with 63 additions and 9 deletions.
cpp/src/fil/common.cuh (7 additions, 0 deletions)

@@ -285,6 +285,13 @@ struct compute_smem_footprint : dispatch_functor<int> {
   int run(predict_params);
 };
 
+template <int NITEMS,
+          leaf_algo_t leaf_algo,
+          bool cols_in_shmem,
+          bool CATS_SUPPORTED,
+          class storage_type>
+__global__ void infer_k(storage_type forest, predict_params params);
+
 // infer() calls the inference kernel with the parameters on the stream
 template <typename storage_type>
 void infer(storage_type forest, predict_params params, cudaStream_t stream);
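The new forward declaration above is what allows fil.cu (below) to name concrete instantiations, infer_k<N_ITEMS, LEAF_ALGO, COLS_IN_SHMEM, CATS_SUPPORTED, storage_type>, so the shared memory attribute can be set on each kernel before it is launched.
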
cpp/src/fil/fil.cu (37 additions, 9 deletions)

@@ -76,24 +76,23 @@
 struct forest {
   forest(const raft::handle_t& h) : vector_leaf_(0, h.get_stream()), cat_sets_(h.get_stream()) {}
 
-  void init_n_items(int device)
+  void init_shmem_size(int device)
   {
-    int max_shm_std = 48 * 1024;  // 48 KiB
     /// the most shared memory a kernel can request on the GPU in question
-    int max_shm = 0;
-    CUDA_CHECK(cudaDeviceGetAttribute(&max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+    CUDA_CHECK(cudaDeviceGetAttribute(&max_shm_, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
     /* Our GPUs have been growing the shared memory size generation after
        generation. Eventually, a CUDA GPU might come along that supports more
        shared memory than would fit into an unsigned 16-bit int. For such a GPU,
        we would have otherwise silently overflowed the index calculation due
        to short division. It would have failed cpp tests, but we might forget
        about this source of bugs, if not for the failing assert. */
-    ASSERT(max_shm < 262144,
+    ASSERT(max_shm_ < int(sizeof(float)) * std::numeric_limits<uint16_t>::max(),
            "internal error: please use a larger type inside"
            " infer_k for column count");
-    // TODO(canonizer): use >48KiB shared memory if available
-    max_shm = std::min(max_shm, max_shm_std);
+  }
+
+  void init_n_items(int device)
+  {
     // searching for the most items per block while respecting the shared
     // memory limits creates a full linear programming problem.
     // solving it in a single equation looks less tractable than this
@@ -109,10 +108,10 @@ struct forest {
       ssp.cols_in_shmem = cols_in_shmem;
       for (ssp.n_items = min_n_items; ssp.n_items <= max_n_items; ++ssp.n_items) {
         ssp.shm_sz = dispatch_on_fil_template_params(compute_smem_footprint(), ssp);
-        if (ssp.shm_sz < max_shm) ssp_ = ssp;
+        if (ssp.shm_sz < max_shm_) ssp_ = ssp;
       }
     }
-    ASSERT(max_shm >= ssp_.shm_sz,
+    ASSERT(max_shm_ >= ssp_.shm_sz,
           "FIL out of shared memory. Perhaps the maximum number of \n"
           "supported classes is exceeded? 5'000 would still be safe.");
   }
@@ -149,6 +148,7 @@ struct forest {
 
     int device = h.get_device();
     cudaStream_t stream = h.get_stream();
+    init_shmem_size(device);
     init_n_items(device);  // n_items takes priority over blocks_per_sm
     init_fixed_block_count(device, params->blocks_per_sm);
 
@@ -325,11 +325,31 @@ struct forest {
   float global_bias_ = 0;
   shmem_size_params class_ssp_, proba_ssp_;
   int fixed_block_count_ = 0;
+  int max_shm_ = 0;
   // Optionally used
   rmm::device_uvector<float> vector_leaf_;
   cat_sets_device_owner cat_sets_;
 };
 
+template <typename storage_type>
+struct opt_into_arch_dependent_shmem : dispatch_functor<void> {
+  const int max_shm;
+  opt_into_arch_dependent_shmem(int max_shm_) : max_shm(max_shm_) {}
+
+  template <typename KernelParams = KernelTemplateParams<>>
+  void run(predict_params p)
+  {
+    auto kernel = infer_k<KernelParams::N_ITEMS,
+                          KernelParams::LEAF_ALGO,
+                          KernelParams::COLS_IN_SHMEM,
+                          KernelParams::CATS_SUPPORTED,
+                          storage_type>;
+    // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because
+    // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick.
+    CUDA_CHECK(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm));
+  }
+};
+
 struct dense_forest : forest {
   dense_forest(const raft::handle_t& h) : forest(h), nodes_(0, h.get_stream()) {}
 
@@ -381,6 +401,10 @@ struct dense_forest : forest {
                                num_nodes * sizeof(dense_node),
                                cudaMemcpyHostToDevice,
                                h.get_stream()));
+
+    // predict_proba is a runtime parameter, and opt-in is unconditional
+    dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<dense_storage>(max_shm_),
+                                    static_cast<predict_params>(class_ssp_));
     // copy must be finished before freeing the host data
     CUDA_CHECK(cudaStreamSynchronize(h.get_stream()));
     h_nodes_.clear();
@@ -436,6 +460,10 @@ struct sparse_forest : forest {
     nodes_.resize(num_nodes_, h.get_stream());
     CUDA_CHECK(cudaMemcpyAsync(
       nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream()));
+
+    // predict_proba is a runtime parameter, and opt-in is unconditional
+    dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<sparse_storage<node_t>>(max_shm_),
+                                    static_cast<predict_params>(class_ssp_));
   }
 
   virtual void infer(predict_params params, cudaStream_t stream) override
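The new opt_into_arch_dependent_shmem functor is run through dispatch_on_fil_template_params, which maps runtime prediction parameters onto one concrete infer_k instantiation per call. Roughly, the pattern looks like the hypothetical single-axis sketch below (simplified to the n_items dimension only; the names KernelTemplateParamsSketch, predict_params_sketch, and dispatch_on_n_items are illustrative, not the actual cuML dispatcher):

#include <cstdio>
#include <stdexcept>

// Compile-time kernel parameters; the real version also carries the
// leaf_algo, cols_in_shmem, and cats_supported axes.
template <int NITEMS = 1>
struct KernelTemplateParamsSketch {
  static const int N_ITEMS = NITEMS;
};

struct predict_params_sketch {
  int n_items;  // runtime value to translate into a template argument
};

// Invokes func.run<KernelTemplateParamsSketch<K>>(p) for the K matching
// p.n_items, turning a runtime value into a compile-time constant.
template <typename Func>
void dispatch_on_n_items(Func func, predict_params_sketch p)
{
  switch (p.n_items) {
    case 1: func.template run<KernelTemplateParamsSketch<1>>(p); break;
    case 2: func.template run<KernelTemplateParamsSketch<2>>(p); break;
    case 3: func.template run<KernelTemplateParamsSketch<3>>(p); break;
    case 4: func.template run<KernelTemplateParamsSketch<4>>(p); break;
    default: throw std::runtime_error("unsupported n_items");
  }
}

// A functor in the style of opt_into_arch_dependent_shmem: run() sees the
// chosen parameters as compile-time constants.
struct print_n_items {
  template <typename KernelParams>
  void run(predict_params_sketch)
  {
    printf("dispatched to N_ITEMS = %d\n", KernelParams::N_ITEMS);
  }
};

int main()
{
  dispatch_on_n_items(print_n_items{}, predict_params_sketch{3});
  return 0;
}

Each combination of template arguments is a distinct __global__ function, so the shared memory attribute has to be set on every instantiation that may be launched, which is why the opt-in is expressed as a dispatched functor rather than a single call.
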
cpp/src/fil/internal.cuh (4 additions, 0 deletions)

@@ -520,7 +520,11 @@ void init_sparse(const raft::handle_t& h,
                  const fil_node_t* nodes,
                  const forest_params_t* params);
 
+struct predict_params;
+
 }  // namespace fil
 
+static const int MAX_SHM_STD = 48 * 1024;  // maximum architecture-independent size
+
 std::string output2str(fil::output_t output);
 }  // namespace ML
cpp/test/sg/fil_test.cu (15 additions, 0 deletions)

@@ -1040,6 +1040,21 @@ std::vector<FilTestParams> predict_dense_inputs = {
                   algo = BATCH_TREE_REORG,
                   leaf_algo = CATEGORICAL_LEAF,
                   num_classes = 3),
+  // use shared memory opt-in carveout if available, or infer out of L1 cache
+  FIL_TEST_PARAMS(num_rows = 103, num_cols = MAX_SHM_STD / sizeof(float) + 1024, algo = NAIVE),
+  FIL_TEST_PARAMS(num_rows = 103,
+                  num_cols = MAX_SHM_STD / sizeof(float) + 1024,
+                  leaf_algo = GROVE_PER_CLASS,
+                  num_classes = 5),
+  FIL_TEST_PARAMS(num_rows = 103,
+                  num_cols = MAX_SHM_STD / sizeof(float) + 1024,
+                  num_trees = FIL_TPB + 1,
+                  leaf_algo = GROVE_PER_CLASS,
+                  num_classes = FIL_TPB + 1),
+  FIL_TEST_PARAMS(num_rows = 103,
+                  num_cols = MAX_SHM_STD / sizeof(float) + 1024,
+                  leaf_algo = CATEGORICAL_LEAF,
+                  num_classes = 3),
   FIL_TEST_PARAMS(algo = BATCH_TREE_REORG, threads_per_tree = 2),
   FIL_TEST_PARAMS(algo = NAIVE, threads_per_tree = 4),
   FIL_TEST_PARAMS(algo = TREE_REORG, threads_per_tree = 8),
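
For scale: num_cols = MAX_SHM_STD / sizeof(float) + 1024 = 48 * 1024 / 4 + 1024 = 13,312 columns, or 53,248 bytes per row of float data. That deliberately exceeds the default 48 KiB shared memory carveout, so these cases use the opt-in carveout where available and otherwise infer out of L1 cache, as the test comment says.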
