From 93774eb19ced0ecbce838d245e9dfb0ee592b478 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 23 Oct 2020 15:05:26 -0700
Subject: [PATCH 01/13] hide inheritance structure and clean up comments

---
 cpp/src/fil/infer.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 5c31b21585..e11d4144ac 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -89,8 +89,7 @@ __device__ __forceinline__ vec<NITEMS, output_type> infer_one_tree(
 #pragma unroll
   for (int j = 0; j < NITEMS; ++j) {
     /** dependent names are not considered templates by default,
-        unless it's a member of a current [template] instantiation.
-        alternatively, could have used .base_node::output<... */
+        unless it's a member of a current [template] instantiation.**/
     out[j] = tree[curr[j]].template output<output_type>();
   }
   return out;
@@ -109,7 +108,9 @@ __device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree,
     curr = n.left(curr) + cond;
   }
   vec<1, output_type> out;
-  out[0] = tree[curr].base_node::output<output_type>();
+  /** dependent names are not considered templates by default,
+      unless it's a member of a current [template] instantiation.**/
+  out[0] = tree[curr[j]].template output<output_type>();
   return out;
 }
 

From 73a43db9e5ae003615ca140c1c6f2d6d633ae2b2 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Wed, 11 Nov 2020 21:45:11 -0800
Subject: [PATCH 02/13] clearer __syncthreads() purpose

---
 cpp/src/fil/infer.cu | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index e11d4144ac..84bd1a2708 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -178,7 +178,7 @@ struct tree_aggregator_t {
 
   __device__ __forceinline__ void finalize(float* out, int num_rows,
                                            int output_stride) {
-    __syncthreads();
+    __syncthreads();  // free up tmp_storage from input rows
     typedef typename BlockReduce<NITEMS>::TempStorage TempStorage;
     acc = BlockReduce<NITEMS>(*(TempStorage*)tmp_storage).Sum(acc);
     if (threadIdx.x == 0) {
@@ -349,7 +349,6 @@ struct tree_aggregator_t<NITEMS, CATEGORICAL_LEAF> {
   // is just the number of outputs for each data instance
   __device__ __forceinline__ void finalize_multiple_outputs(float* out,
                                                             int num_rows) {
-    __syncthreads();
     int item = threadIdx.x;
     int row = blockIdx.x * NITEMS + item;
     if (item < NITEMS && row < num_rows) {
@@ -362,7 +361,6 @@ struct tree_aggregator_t<NITEMS, CATEGORICAL_LEAF> {
   // or class probabilities or regression
   __device__ __forceinline__ void finalize_class_label(float* out,
                                                        int num_rows) {
-    __syncthreads();
     int item = threadIdx.x;
     int row = blockIdx.x * NITEMS + item;
     if (item < NITEMS && row < num_rows) {
@@ -379,6 +377,7 @@ struct tree_aggregator_t<NITEMS, CATEGORICAL_LEAF> {
   }
   __device__ __forceinline__ void finalize(float* out, int num_rows,
                                            int num_outputs) {
+    __syncthreads();  // make sure all votes[] are final
     if (num_outputs > 1) {
       // only supporting num_outputs == num_classes
       finalize_multiple_outputs(out, num_rows);

From da8529ff8eee92d41497cde017a879c318f8faa4 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 13 Nov 2020 18:32:54 -0800
Subject: [PATCH 03/13] simplify treelite.Model handle extraction

---
 python/cuml/fil/fil.pyx | 36 +++---------------------------------
 1 file changed, 3 insertions(+), 33 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index c8c9b8e994..8df3ed70d5 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -310,44 +310,14 @@ cdef class ForestInference_impl():
             output_dtype=output_dtype
         )
 
-    def load_from_treelite_model_handle(self,
-                                        uintptr_t model_handle,
-                                        bool output_class,
-                                        str algo,
-                                        float threshold,
-                                        str storage_type):
-        cdef treelite_params_t treelite_params
-
-        self.output_class = output_class
-        treelite_params.output_class = self.output_class
-        treelite_params.threshold = threshold
-        treelite_params.algo = self.get_algo(algo)
-        treelite_params.storage_type = self.get_storage_type(storage_type)
-
-        self.forest_data = NULL
-        cdef handle_t* handle_ =\
-            <handle_t*><size_t>self.handle.getHandle()
-        cdef uintptr_t model_ptr = <uintptr_t>model_handle
-
-        from_treelite(handle_[0],
-                      &self.forest_data,
-                      <ModelHandle> model_ptr,
-                      &treelite_params)
-        TreeliteQueryNumOutputGroups(<ModelHandle> model_ptr,
-                                     & self.num_output_groups)
-        return self
-
     def load_from_treelite_model(self,
                                  TreeliteModel model,
                                  bool output_class,
                                  str algo,
                                  float threshold,
                                  str storage_type):
-        TreeliteQueryNumOutputGroups(<ModelHandle> model.handle,
-                                     & self.num_output_groups)
-        return self.load_from_treelite_model_handle(<uintptr_t>model.handle,
-                                                    output_class, algo,
-                                                    threshold, storage_type)
+        return self.load_using_treelite_handle(model.handle, output_class,
+                                               algo, threshold, storage_type)
 
     def load_using_treelite_handle(self,
                                    model_handle,
@@ -582,7 +552,7 @@ class ForestInference(Base):
                 model, output_class, algo, threshold, str(storage_type))
         else:
             # assume it is treelite.Model
-            return self._impl.load_from_treelite_model_handle(
+            return self._impl.load_using_treelite_handle(
                 model.handle.value, output_class, algo, threshold,
                 str(storage_type))
 

From 1e00e64acc76ebe283ca9a7ccc1a1664db87b3b1 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 13 Nov 2020 18:51:41 -0800
Subject: [PATCH 04/13] removed deprecated "silent" from xgboost params;
 repeated code

---
 python/cuml/test/test_fil.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/python/cuml/test/test_fil.py b/python/cuml/test/test_fil.py
index 5fb968b10a..224c3ef688 100644
--- a/python/cuml/test/test_fil.py
+++ b/python/cuml/test/test_fil.py
@@ -64,22 +64,19 @@ def _build_and_save_xgboost(model_path,
     dtrain = xgb.DMatrix(X_train, label=y_train)
 
     # instantiate params
-    params = {'silent': 1}
+    params = {'eval_metric': 'error', 'max_depth': 25}
 
     # learning task params
     if classification:
-        params['eval_metric'] = 'error'
         if n_classes == 2:
             params['objective'] = 'binary:logistic'
         else:
             params['num_class'] = n_classes
             params['objective'] = 'multi:softmax'
     else:
-        params['eval_metric'] = 'error'
         params['objective'] = 'reg:squarederror'
         params['base_score'] = 0.0
 
-    params['max_depth'] = 25
     params.update(xgboost_params)
     bst = xgb.train(params, dtrain, num_rounds)
     bst.save_model(model_path)

From 9e7463c980f6b38272f7d4dbf030f937a414ebba Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 13 Nov 2020 20:11:53 -0800
Subject: [PATCH 05/13] moved all_set closer to where it gets affected

---
 cpp/include/cuml/fil/fil.h | 1 +
 cpp/src/fil/fil.cu         | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index 01415c946d..921661e940 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -74,6 +74,7 @@ enum output_t {
   SIGMOID_CLASS = SIGMOID | CLASS,
   AVG_CLASS = AVG | CLASS,
   AVG_SIGMOID_CLASS = AVG | SIGMOID | CLASS,
+  all_set = AVG | SIGMOID | CLASS
 };
 
 /** storage_type_t defines whether to import the forests as dense or sparse */
diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index 6a43bb1f1c..f2b7991a1b 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -437,9 +437,7 @@ void check_params(const forest_params_t* params, bool dense) {
              " or GROVE_PER_CLASS");
   }
   // output_t::RAW == 0, and doesn't have a separate flag
-  output_t all_set =
-    output_t(output_t::AVG | output_t::SIGMOID | output_t::CLASS);
-  if ((params->output & ~all_set) != 0) {
+  if ((params->output & ~output_t::all_set) != 0) {
     ASSERT(false,
            "output should be a combination of RAW, AVG, SIGMOID and CLASS");
   }

From aa002fcf06f4cfacc26a7874bd006b048b9a933a Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 13 Nov 2020 22:56:56 -0800
Subject: [PATCH 06/13] fix stray curr[j] index in vec<1> infer_one_tree

---
 cpp/src/fil/infer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 84bd1a2708..03dc2512c9 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -110,7 +110,7 @@ __device__ __forceinline__ vec<1, output_type> infer_one_tree(tree_type tree,
   vec<1, output_type> out;
   /** dependent names are not considered templates by default,
       unless it's a member of a current [template] instantiation.**/
-  out[0] = tree[curr[j]].template output<output_type>();
+  out[0] = tree[curr].template output<output_type>();
   return out;
 }
 

From 1db94e587366138cf71a9a99f5d0f70326018e1d Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 20 Nov 2020 17:15:04 -0800
Subject: [PATCH 07/13] Revert "simplify treelite.Model handle extraction"

This reverts commit da8529ff8eee92d41497cde017a879c318f8faa4.
---
 python/cuml/fil/fil.pyx | 36 +++++++++++++++++++++++++++++++++---
 1 file changed, 33 insertions(+), 3 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 8df3ed70d5..c8c9b8e994 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -310,14 +310,44 @@ cdef class ForestInference_impl():
             output_dtype=output_dtype
         )
 
+    def load_from_treelite_model_handle(self,
+                                        uintptr_t model_handle,
+                                        bool output_class,
+                                        str algo,
+                                        float threshold,
+                                        str storage_type):
+        cdef treelite_params_t treelite_params
+
+        self.output_class = output_class
+        treelite_params.output_class = self.output_class
+        treelite_params.threshold = threshold
+        treelite_params.algo = self.get_algo(algo)
+        treelite_params.storage_type = self.get_storage_type(storage_type)
+
+        self.forest_data = NULL
+        cdef handle_t* handle_ =\
+            <handle_t*><size_t>self.handle.getHandle()
+        cdef uintptr_t model_ptr = <uintptr_t>model_handle
+
+        from_treelite(handle_[0],
+                      &self.forest_data,
+                      <ModelHandle> model_ptr,
+                      &treelite_params)
+        TreeliteQueryNumOutputGroups(<ModelHandle> model_ptr,
+                                     & self.num_output_groups)
+        return self
+
     def load_from_treelite_model(self,
                                  TreeliteModel model,
                                  bool output_class,
                                  str algo,
                                  float threshold,
                                  str storage_type):
-        return self.load_using_treelite_handle(model.handle, output_class,
-                                               algo, threshold, storage_type)
+        TreeliteQueryNumOutputGroups(<ModelHandle> model.handle,
+                                     & self.num_output_groups)
+        return self.load_from_treelite_model_handle(<uintptr_t>model.handle,
+                                                    output_class, algo,
+                                                    threshold, storage_type)
 
     def load_using_treelite_handle(self,
                                    model_handle,
@@ -552,7 +582,7 @@ class ForestInference(Base):
                 model, output_class, algo, threshold, str(storage_type))
         else:
             # assume it is treelite.Model
-            return self._impl.load_using_treelite_handle(
+            return self._impl.load_from_treelite_model_handle(
                 model.handle.value, output_class, algo, threshold,
                 str(storage_type))
 

From 008fd55d2a1c2d274cbbccc391504fdf8d540f09 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 4 Jun 2021 18:27:15 -0700
Subject: [PATCH 08/13] update documentation

---
 python/cuml/fil/README.md |   8 +-
 python/cuml/fil/fil.pyx   | 220 ++++++++++++--------------------------
 2 files changed, 71 insertions(+), 157 deletions(-)

diff --git a/python/cuml/fil/README.md b/python/cuml/fil/README.md
index d3c26544e7..e1e2cad97a 100644
--- a/python/cuml/fil/README.md
+++ b/python/cuml/fil/README.md
@@ -33,7 +33,7 @@ Additionally, FIL can be called directly from C or C++ code. See [the API docs h
 # Features
 
 * Input model source: XGBoost (binary format), cuML RandomForest, scikit-learn RandomForest, LightGBM
-* Model types: Regression, Binary Classification, Multi-class Classification (for cuML Random Forests, but not GBDTs or scikit-learn Random Forests)
+* Model types: Regression, Binary Classification, Multi-class Classification (for cuML Random Forests or GBDTs, but not scikit-learn Random Forests)
 * Tree storage types: Dense or sparse tree storage (see Sparse Forests with FIL blog below)
 * Input formats: Dense, row-major, FP32 arrays on GPU or CPU (e.g. NumPy, cuPy, or other data formats supported by cuML). Trees are expected to be trained for float32 inputs. There may be rounding differences if trees were trained for float64 inputs.
 * High performance batch inference
@@ -41,9 +41,10 @@ Additionally, FIL can be called directly from C or C++ code. See [the API docs h
 
 Upcoming features:
 
-* Support for multi-class GBDTs is planned for RAPIDS 0.16
+* Support for multi-class random forests from scikit-learn
 * Support for smaller node storage (8-byte) to reduce memory usage for
   small trees is experimental
+* Categorical features for LightGBM models
 
 # Benchmarks and performance notes
 
@@ -74,5 +75,6 @@ GPU, using FIL 0.9.)
 * [RAPIDS Forest Inference Library: Prediction at 100 million rows per second](https://medium.com/rapids-ai/rapids-forest-inference-library-prediction-at-100-million-rows-per-second-19558890bc35)
 * [Sparse Forests with FIL](https://medium.com/rapids-ai/sparse-forests-with-fil-ffbb42b0c7e3
 )
-* [GBM Inferencing on GPU (earlier research work)](https://on-demand.gputechconf.com/gtc/2018/presentation/s8873-gbm-inferencing-on-gpu-v2.pdf)
+* [GBM Inferencing on GPU, 2018 talk (earlier research work)](https://on-demand.gputechconf.com/gtc/2018/presentation/s8873-gbm-inferencing-on-gpu-v2.pdf)
 * [Sample Notebook](https://github.com/rapidsai/cuml/blob/branch-0.16/notebooks/forest_inference_demo.ipynb)
+* [GTC 2021 talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31296/)
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 5ba5b452ad..af255c1cde 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -401,7 +401,7 @@ class ForestInference(Base,
 
     **Known limitations**:
      * A single row of data should fit into the shared memory of a thread
-       block, which means that more than 12288 features are not supported.
+       block, which means that more than 5000-12288 features will infer from L1.
      * From sklearn.ensemble, only
        {RandomForest,GradientBoosting,ExtraTrees}{Classifier,Regressor} models
        are supported. Other sklearn.ensemble models are currently not
@@ -466,6 +466,59 @@ class ForestInference(Base,
 
     """
 
+    common_load_params_docstring = """
+    output_class: boolean (default=False)
+        For a Classification model `output_class` must be True.
+        For a Regression model `output_class` must be False.
+    algo : string (default='auto')
+        Name of the algo (from algo_t enum):
+
+         - ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
+           Currently 'BATCH_TREE_REORG' is used for dense storage,
+           and 'NAIVE' for sparse storage
+         - ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
+         - ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
+           rearranged to be more coalescing-friendly
+         - ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
+           TREE_REORG but predicting multiple rows per thread block
+
+    threshold : float (default=0.5)
+        Threshold is used for classification. It is applied
+        only if ``output_class == True``, else it is ignored.
+    storage_type : string or boolean (default='auto')
+        In-memory storage format to be used for the FIL model:
+
+         - ``'auto'``: Choose the storage type automatically
+           (currently DENSE is always used)
+         - ``False``: Create a dense forest
+         - ``True``: Create a sparse forest. Requires algo='NAIVE' or
+           algo='AUTO'
+
+    blocks_per_sm : integer (default=0)
+        (experimental) Indicates how the number of thread blocks to launch
+        for the inference kernel is determined.
+
+        - ``0`` (default): Launches the number of blocks proportional to
+          the number of data rows
+        - ``>= 1``: Attempts to launch blocks_per_sm blocks per SM. This
+          will fail if blocks_per_sm blocks result in more threads than the
+          maximum supported number of threads per GPU. Even if successful,
+          it is not guaranteed that blocks_per_sm blocks will run on an SM
+          concurrently.
+    compute_shape_str : boolean (default=False)
+        if True or equivalent, creates a ForestInference.shape_str
+        (writes a human-readable forest shape description as a
+        multiline ascii string)
+    """
+
+    common_predict_params_docstring = """
+    X : array-like (device or host) shape = (n_samples, n_features)
+       Dense matrix (floats) of shape (n_samples, n_features).
+       Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
+       ndarray, cuda array interface compliant array like CuPy
+       For optimal performance, pass a device array with C-style layout
+    """
+
     def __init__(self, *,
                  handle=None,
                  output_type=None,
@@ -486,11 +539,8 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        X : array-like (device or host) shape = (n_samples, n_features)
-           Dense matrix (floats) of shape (n_samples, n_features).
-           Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
-           ndarray, cuda array interface compliant array like CuPy
-           For optimal performance, pass a device array with C-style layout
+        """ + common_predict_params_docstring
+        """
         preds: gpuarray or cudf.Series, shape = (n_samples,)
            Optional 'out' location to store inference results
 
@@ -509,11 +559,8 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        X : array-like (device or host) shape = (n_samples, n_features)
-           Dense matrix (floats) of shape (n_samples, n_features).
-           Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
-           ndarray, cuda array interface compliant array like CuPy
-           For optimal performance, pass a device array with C-style layout
+        """ + common_predict_params_docstring
+        """
         preds: gpuarray or cudf.Series, shape = (n_samples,2)
            Binary probability output
            Optional 'out' location to store inference results
@@ -541,53 +588,8 @@ class ForestInference(Base,
             the trained model information in the treelite format
             loaded from a saved model using the treelite API
             https://treelite.readthedocs.io/en/latest/treelite-api.html
-        output_class: boolean (default=False)
-            For a Classification model `output_class` must be True.
-            For a Regression model `output_class` must be False.
-        algo : string (default='auto')
-            Name of the algo from (from algo_t enum):
-
-             - ``'AUTO'`` or ``'auto'``: choose the algorithm automatically.
-               Currently 'BATCH_TREE_REORG' is used for dense storage,
-               and 'NAIVE' for sparse storage
-             - ``'NAIVE'`` or ``'naive'``: simple inference using shared memory
-             - ``'TREE_REORG'`` or ``'tree_reorg'``: similar to naive but trees
-               rearranged to be more coalescing-friendly
-             - ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: similar to
-               TREE_REORG but predicting multiple rows per thread block
-
-        threshold : float (default=0.5)
-            Threshold is used to for classification. It is applied
-            only if ``output_class == True``, else it is ignored.
-        storage_type : string or boolean (default='auto')
-            In-memory storage format to be used for the FIL model:
-
-             - ``'auto'``: Choose the storage type automatically
-               (currently DENSE is always used)
-             - ``False``: Create a dense forest
-             - ``True``: Create a sparse forest. Requires algo='NAIVE' or
-               algo='AUTO'
-             - ``'sparse8'``: (experimental) Create a sparse forest with 8-byte
-               nodes. Requires algo='NAIVE' or algo='AUTO'. Can fail if 8-byte
-               nodes are not enough to store the forest, e.g. if there are too
-               many nodes in a tree or too many features
-
-        blocks_per_sm : integer (default=0)
-            (experimental) Indicates how the number of thread blocks to lauch
-            for the inference kernel is determined.
-
-            - ``0`` (default): Launches the number of blocks proportional to
-              the number of data rows
-            - ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
-              will fail if blocks_per_sm blocks result in more threads than the
-              maximum supported number of threads per GPU. Even if successful,
-              it is not guaranteed that blocks_per_sm blocks will run on an SM
-              concurrently.
-        compute_shape_str : boolean (default=False)
-            if True or equivalent, creates a ForestInference.shape_str
-            (writes a human-readable forest shape description as a
-            multiline ascii string)
-
+        """ + common_load_params_docstring
+        """
         Returns
         ----------
         fil_model
@@ -622,48 +624,8 @@ class ForestInference(Base,
         ----------
         skl_model
             The scikit-learn model from which to build the FIL version.
-        output_class: boolean (default=False)
-            For a Classification model `output_class` must be True.
-            For a Regression model `output_class` must be False.
-        algo : string (default='auto')
-            Name of the algo from (from algo_t enum):
-
-             - ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
-               Currently 'BATCH_TREE_REORG' is used for dense storage,
-               and 'NAIVE' for sparse storage
-             - ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
-             - ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
-               rearranged to be more coalescing-friendly
-             - ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
-               TREE_REORG but predicting multiple rows per thread block
-
-        threshold : float (default=0.5)
-            Threshold is used to for classification. It is applied
-            only if ``output_class == True``, else it is ignored.
-        storage_type : string or boolean (default='auto')
-            In-memory storage format to be used for the FIL model:
-
-             - ``'auto'``: Choose the storage type automatically
-               (currently DENSE is always used)
-             - ``False``: Create a dense forest
-             - ``True``: Create a sparse forest. Requires algo='NAIVE' or
-               algo='AUTO'
-
-        blocks_per_sm : integer (default=0)
-            (experimental) Indicates how the number of thread blocks to lauch
-            for the inference kernel is determined.
-
-            - ``0`` (default): Launches the number of blocks proportional to
-              the number of data rows
-            - ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
-              will fail if blocks_per_sm blocks result in more threads than the
-              maximum supported number of threads per GPU. Even if successful,
-              it is not guaranteed that blocks_per_sm blocks will run on an SM
-              concurrently.
-        compute_shape_str : boolean (default=False)
-            if True or equivalent, creates a ForestInference.shape_str
-            (writes a human-readable forest shape description as a
-            multiline ascii string)
+        """ + common_load_params_docstring
+        """
 
         Returns
         ----------
@@ -702,33 +664,8 @@ class ForestInference(Base,
             Path to saved model file in a treelite-compatible format
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        output_class: boolean (default=False)
-            For a Classification model `output_class` must be True.
-            For a Regression model `output_class` must be False.
-        threshold : float (default=0.5)
-            Cutoff value above which a prediction is set to 1.0
-            Only used if the model is classification and `output_class` is True
-        algo : string (default='auto')
-            Which inference algorithm to use.
-            See documentation in `FIL.load_from_treelite_model`
-        storage_type : string (default='auto')
-            In-memory storage format to be used for the FIL model.
-            See documentation in `FIL.load_from_treelite_model`
-        blocks_per_sm : integer (default=0)
-            (experimental) Indicates how the number of thread blocks to lauch
-            for the inference kernel is determined.
-
-            - ``0`` (default): Launches the number of blocks proportional to
-              the number of data rows
-            - ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
-              will fail if blocks_per_sm blocks result in more threads than the
-              maximum supported number of threads per GPU. Even if successful,
-              it is not guaranteed that blocks_per_sm blocks will run on an SM
-              concurrently.
-        compute_shape_str : boolean (default=False)
-            if True or equivalent, creates a ForestInference.shape_str
-            (writes a human-readable forest shape description as a
-            multiline ascii string)
+        """ + common_load_params_docstring
+        """
         model_type : string (default="xgboost")
             Format of the saved treelite model to be load.
             It can be 'xgboost', 'xgboost_json', 'lightgbm'.
@@ -765,33 +702,8 @@ class ForestInference(Base,
         model_handle : Modelhandle to the treelite forest model
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        output_class: boolean (default=False)
-            For a Classification model `output_class` must be True.
-            For a Regression model `output_class` must be False.
-        threshold : float (default=0.5)
-            Cutoff value above which a prediction is set to 1.0
-            Only used if the model is classification and `output_class` is True
-        algo : string (default='auto')
-            Which inference algorithm to use.
-            See documentation in `FIL.load_from_treelite_model`
-        storage_type : string (default='auto')
-            In-memory storage format to be used for the FIL model.
-            See documentation in `FIL.load_from_treelite_model`
-        blocks_per_sm : integer (default=0)
-            (experimental) Indicates how the number of thread blocks to lauch
-            for the inference kernel is determined.
-
-            - ``0`` (default): Launches the number of blocks proportional to
-              the number of data rows
-            - ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
-              will fail if blocks_per_sm blocks result in more threads than the
-              maximum supported number of threads per GPU. Even if successful,
-              it is not guaranteed that blocks_per_sm blocks will run on an SM
-              concurrently.
-        compute_shape_str : boolean (default=False)
-            if True or equivalent, creates a ForestInference.shape_str
-            (writes a human-readable forest shape description as a
-            multiline ascii string)
+        """ + common_load_params_docstring
+        """
 
         Returns
         ----------

From 48d868b1e48016fa92817bcd352936d3d8d73425 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 4 Jun 2021 20:43:46 -0700
Subject: [PATCH 09/13] delayed merge error fix

---
 cpp/include/cuml/fil/fil.h | 31 -------------------------------
 cpp/src/fil/infer.cu       |  1 +
 python/cuml/fil/fil.pyx    | 14 +++++++-------
 3 files changed, 8 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cuml/fil/fil.h b/cpp/include/cuml/fil/fil.h
index 4d8193e347..3cdaadc1f5 100644
--- a/cpp/include/cuml/fil/fil.h
+++ b/cpp/include/cuml/fil/fil.h
@@ -50,37 +50,6 @@ enum algo_t {
   BATCH_TREE_REORG
 };
 
-/**
- * output_t are flags that define the output produced by the FIL predictor; a
- * valid output_t values consists of the following, combined using '|' (bitwise
- * or), which define stages, which operation in the next stage applied to the
- * output of the previous stage:
- * - one of RAW or AVG, indicating how to combine individual tree outputs into the forest output
- * - optional SIGMOID for applying the sigmoid transform
- * - optional CLASS, to output the class label
- */
-enum output_t {
-  /** raw output: the sum of the tree outputs; use for GBM models for
-      regression, or for binary classification for the value before the
-      transformation; note that this value is 0, and may be omitted
-      when combined with other flags */
-  RAW = 0x0,
-  /** average output: divide the sum of the tree outputs by the number of trees
-      before further transformations; use for random forests for regression
-      and binary classification for the probability */
-  AVG = 0x1,
-  /** sigmoid transformation: apply 1/(1+exp(-x)) to the sum or average of tree
-      outputs; use for GBM binary classification models for probability */
-  SIGMOID = 0x10,
-  /** output class label: either apply threshold to the output of the previous stage (for binary classification),
-      or select the class with the most votes to get the class label (for multi-class classification).  */
-  CLASS = 0x100,
-  SIGMOID_CLASS = SIGMOID | CLASS,
-  AVG_CLASS = AVG | CLASS,
-  AVG_SIGMOID_CLASS = AVG | SIGMOID | CLASS,
-  all_set = AVG | SIGMOID | CLASS
-};
-
 /** storage_type_t defines whether to import the forests as dense or sparse */
 enum storage_type_t {
   /** decide automatically; currently always builds dense forests */
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 7105fff5fc..4007287d42 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -501,6 +501,7 @@ struct tree_aggregator_t<NITEMS, CATEGORICAL_LEAF> {
   // or class probabilities or regression
   __device__ __forceinline__ void finalize_class_label(float* out,
                                                        int num_rows) {
+    __syncthreads();  // make sure all votes[] are final
     int item = threadIdx.x;
     int row = item;
     if (item < NITEMS && row < num_rows) {
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index af255c1cde..328d9cc70e 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -401,7 +401,7 @@ class ForestInference(Base,
 
     **Known limitations**:
      * A single row of data should fit into the shared memory of a thread
-       block, which means that more than 5000-12288 features will infer from L1.
+       block, which means that more than 5000-12288 features will infer from L1
      * From sklearn.ensemble, only
        {RandomForest,GradientBoosting,ExtraTrees}{Classifier,Regressor} models
        are supported. Other sklearn.ensemble models are currently not
@@ -539,7 +539,7 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + common_predict_params_docstring
+        """ + common_predict_params_docstring +
         """
         preds: gpuarray or cudf.Series, shape = (n_samples,)
            Optional 'out' location to store inference results
@@ -559,7 +559,7 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + common_predict_params_docstring
+        """ + common_predict_params_docstring +
         """
         preds: gpuarray or cudf.Series, shape = (n_samples,2)
            Binary probability output
@@ -588,7 +588,7 @@ class ForestInference(Base,
             the trained model information in the treelite format
             loaded from a saved model using the treelite API
             https://treelite.readthedocs.io/en/latest/treelite-api.html
-        """ + common_load_params_docstring
+        """ + common_load_params_docstring +
         """
         Returns
         ----------
@@ -624,7 +624,7 @@ class ForestInference(Base,
         ----------
         skl_model
             The scikit-learn model from which to build the FIL version.
-        """ + common_load_params_docstring
+        """ + common_load_params_docstring +
         """
 
         Returns
@@ -664,7 +664,7 @@ class ForestInference(Base,
             Path to saved model file in a treelite-compatible format
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + common_load_params_docstring
+        """ + common_load_params_docstring +
         """
         model_type : string (default="xgboost")
             Format of the saved treelite model to be load.
@@ -702,7 +702,7 @@ class ForestInference(Base,
         model_handle : Modelhandle to the treelite forest model
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + common_load_params_docstring
+        """ + common_load_params_docstring +
         """
 
         Returns

From 7dca8a32f3741c4f622d33115d6a150dfb239d42 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Fri, 4 Jun 2021 23:18:52 -0700
Subject: [PATCH 10/13] python newlines

---
 python/cuml/fil/fil.pyx | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 328d9cc70e..3825b726e7 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -539,8 +539,7 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + common_predict_params_docstring +
-        """
+        """ + ForestInference.common_predict_params_docstring + """
         preds: gpuarray or cudf.Series, shape = (n_samples,)
            Optional 'out' location to store inference results
 
@@ -559,8 +558,7 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + common_predict_params_docstring +
-        """
+        """ + ForestInference.common_predict_params_docstring + """
         preds: gpuarray or cudf.Series, shape = (n_samples,2)
            Binary probability output
            Optional 'out' location to store inference results
@@ -588,8 +586,7 @@ class ForestInference(Base,
             the trained model information in the treelite format
             loaded from a saved model using the treelite API
             https://treelite.readthedocs.io/en/latest/treelite-api.html
-        """ + common_load_params_docstring +
-        """
+        """ + ForestInference.common_load_params_docstring + """
         Returns
         ----------
         fil_model
@@ -624,8 +621,7 @@ class ForestInference(Base,
         ----------
         skl_model
             The scikit-learn model from which to build the FIL version.
-        """ + common_load_params_docstring +
-        """
+        """ + ForestInference.common_load_params_docstring + """
 
         Returns
         ----------
@@ -664,8 +660,7 @@ class ForestInference(Base,
             Path to saved model file in a treelite-compatible format
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + common_load_params_docstring +
-        """
+        """ + ForestInference.common_load_params_docstring + """
         model_type : string (default="xgboost")
             Format of the saved treelite model to be load.
             It can be 'xgboost', 'xgboost_json', 'lightgbm'.
@@ -702,8 +697,7 @@ class ForestInference(Base,
         model_handle : Modelhandle to the treelite forest model
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + common_load_params_docstring +
-        """
+        """ + ForestInference.common_load_params_docstring + """
 
         Returns
         ----------

From f4fb48e222e9da99ada3f821fb1825c9b44dccee Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Tue, 22 Jun 2021 02:05:04 -0700
Subject: [PATCH 11/13] fixed all but static method docstring modifications

---
 python/cuml/fil/fil.pyx | 33 ++++++++++++++++++++++-----------
 1 file changed, 22 insertions(+), 11 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 62562af04b..05df7b41cb 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -23,6 +23,7 @@ import math
 import numpy as np
 import warnings
 import pandas as pd
+from inspect import getdoc
 
 import rmm
 
@@ -478,7 +479,8 @@ class ForestInference(Base,
 
     """
 
-    common_load_params_docstring = """
+    def common_load_params_docstring(func):
+        func.__doc__ = getdoc(func).format("""
     output_class: boolean (default=False)
         For a Classification model `output_class` must be True.
         For a Regression model `output_class` must be False.
@@ -521,15 +523,18 @@ class ForestInference(Base,
         if True or equivalent, creates a ForestInference.shape_str
         (writes a human-readable forest shape description as a
         multiline ascii string)
-    """
+    """)
+        return func
 
-    common_predict_params_docstring = """
+    def common_predict_params_docstring(func):
+        func.__doc__ = getdoc(func).format("""
     X : array-like (device or host) shape = (n_samples, n_features)
        Dense matrix (floats) of shape (n_samples, n_features).
        Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
        ndarray, cuda array interface compliant array like CuPy
        For optimal performance, pass a device array with C-style layout
-    """
+    """)
+        return func
 
     def __init__(self, *,
                  handle=None,
@@ -540,6 +545,7 @@ class ForestInference(Base,
                          output_type=output_type)
         self._impl = ForestInference_impl(self.handle)
 
+    @common_predict_params_docstring
     def predict(self, X, preds=None) -> CumlArray:
         """
         Predicts the labels for X with the loaded forest model.
@@ -551,8 +557,8 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + ForestInference.common_predict_params_docstring + """
-        preds: gpuarray or cudf.Series, shape = (n_samples,)
+    {}
+        preds : gpuarray or cudf.Series, shape = (n_samples,)
            Optional 'out' location to store inference results
 
         Returns
@@ -562,6 +568,7 @@ class ForestInference(Base,
         """
         return self._impl.predict(X, predict_proba=False, preds=None)
 
+    @common_predict_params_docstring
     def predict_proba(self, X, preds=None) -> CumlArray:
         """
         Predicts the class probabilities for X with the loaded forest model.
@@ -570,7 +577,7 @@ class ForestInference(Base,
 
         Parameters
         ----------
-        """ + ForestInference.common_predict_params_docstring + """
+    {}
         preds: gpuarray or cudf.Series, shape = (n_samples,2)
            Binary probability output
            Optional 'out' location to store inference results
@@ -582,6 +589,7 @@ class ForestInference(Base,
         """
         return self._impl.predict(X, predict_proba=True, preds=None)
 
+    @common_load_params_docstring
     def load_from_treelite_model(self, model, output_class=False,
                                  algo='auto',
                                  threshold=0.5,
@@ -600,7 +608,7 @@ class ForestInference(Base,
             the trained model information in the treelite format
             loaded from a saved model using the treelite API
             https://treelite.readthedocs.io/en/latest/treelite-api.html
-        """ + ForestInference.common_load_params_docstring + """
+    {}
         Returns
         ----------
         fil_model
@@ -619,6 +627,7 @@ class ForestInference(Base,
         return self
 
     @staticmethod
+    @common_load_params_docstring
     def load_from_sklearn(skl_model,
                           output_class=False,
                           threshold=0.50,
@@ -637,7 +646,7 @@ class ForestInference(Base,
         ----------
         skl_model
             The scikit-learn model from which to build the FIL version.
-        """ + ForestInference.common_load_params_docstring + """
+    {}
 
         Returns
         ----------
@@ -656,6 +665,7 @@ class ForestInference(Base,
         cuml_fm.load_from_treelite_model(model=tl_model, **kwargs)
         return cuml_fm
 
+    @common_load_params_docstring
     @staticmethod
     def load(filename,
              output_class=False,
@@ -678,7 +688,7 @@ class ForestInference(Base,
             Path to saved model file in a treelite-compatible format
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + ForestInference.common_load_params_docstring + """
+    {}
         model_type : string (default="xgboost")
             Format of the saved treelite model to be load.
             It can be 'xgboost', 'xgboost_json', 'lightgbm'.
@@ -697,6 +707,7 @@ class ForestInference(Base,
         cuml_fm.load_from_treelite_model(model=tl_model, **kwargs)
         return cuml_fm
 
+    @common_load_params_docstring
     def load_using_treelite_handle(self,
                                    model_handle,
                                    output_class=False,
@@ -717,7 +728,7 @@ class ForestInference(Base,
         model_handle : Modelhandle to the treelite forest model
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-        """ + ForestInference.common_load_params_docstring + """
+    {}
 
         Returns
         ----------

From 7e9fb875e9752cb1a56dccfd318f9797af15b703 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Wed, 23 Jun 2021 16:06:48 -0700
Subject: [PATCH 12/13] addressed review comments; worked around static method
 patching

---
 python/cuml/fil/README.md |  7 ++-
 python/cuml/fil/fil.pyx   | 95 ++++++++++++++++++++++++++++++++++++---
 2 files changed, 91 insertions(+), 11 deletions(-)

diff --git a/python/cuml/fil/README.md b/python/cuml/fil/README.md
index e1e2cad97a..dde61369ea 100644
--- a/python/cuml/fil/README.md
+++ b/python/cuml/fil/README.md
@@ -32,8 +32,8 @@ Additionally, FIL can be called directly from C or C++ code. See [the API docs h
 
 # Features
 
-* Input model source: XGBoost (binary format), cuML RandomForest, scikit-learn RandomForest, LightGBM
-* Model types: Regression, Binary Classification, Multi-class Classification (for cuML Random Forests or GBDTs, but not scikit-learn Random Forests)
+* Input model source: XGBoost (binary format), cuML RandomForest, scikit-learn RandomForest and similar classes, LightGBM
+* Model types: Regression, Binary Classification, Multi-class Classification
 * Tree storage types: Dense or sparse tree storage (see Sparse Forests with FIL blog below)
 * Input formats: Dense, row-major, FP32 arrays on GPU or CPU (e.g. NumPy, cuPy, or other data formats supported by cuML). Trees are expected to be trained for float32 inputs. There may be rounding differences if trees were trained for float64 inputs.
 * High performance batch inference
@@ -42,8 +42,7 @@ Additionally, FIL can be called directly from C or C++ code. See [the API docs h
 Upcoming features:
 
 * Support for multi-class random forests from scikit-learn
-* Support for smaller node storage (8-byte) to reduce memory usage for
-  small trees is experimental
+* Support for 8-byte sparse nodes to reduce memory usage for small trees is experimental
 * Categorical features for LightGBM models
 
 # Benchmarks and performance notes
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 05df7b41cb..ff804d296c 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -414,10 +414,11 @@ class ForestInference(Base,
 
     **Known limitations**:
      * A single row of data should fit into the shared memory of a thread
-       block, which means that more than 5000-12288 features will infer from L1
+       block, otherwise (starting from 5000-12288 features) FIL might infer
+       slower
      * From sklearn.ensemble, only
-       {RandomForest,GradientBoosting,ExtraTrees}{Classifier,Regressor} models
-       are supported. Other sklearn.ensemble models are currently not
+       `{RandomForest,GradientBoosting,ExtraTrees}{Classifier,Regressor}`
+       models are supported. Other sklearn.ensemble models are currently not
        supported.
      * Importing large SKLearn models can be slow, as it is done in Python.
      * LightGBM categorical features are not supported.
@@ -627,7 +628,6 @@ class ForestInference(Base,
         return self
 
     @staticmethod
-    @common_load_params_docstring
     def load_from_sklearn(skl_model,
                           output_class=False,
                           threshold=0.50,
@@ -646,7 +646,48 @@ class ForestInference(Base,
         ----------
         skl_model
             The scikit-learn model from which to build the FIL version.
-    {}
+        output_class: boolean (default=False)
+            For a Classification model `output_class` must be True.
+            For a Regression model `output_class` must be False.
+        algo : string (default='auto')
+            Name of the algo (from algo_t enum):
+
+             - ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
+               Currently 'BATCH_TREE_REORG' is used for dense storage,
+               and 'NAIVE' for sparse storage
+             - ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
+             - ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
+               rearranged to be more coalescing-friendly
+             - ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
+               TREE_REORG but predicting multiple rows per thread block
+
+        threshold : float (default=0.5)
+            Threshold is used for classification. It is applied
+            only if ``output_class == True``, else it is ignored.
+        storage_type : string or boolean (default='auto')
+            In-memory storage format to be used for the FIL model:
+
+             - ``'auto'``: Choose the storage type automatically
+               (currently DENSE is always used)
+             - ``False``: Create a dense forest
+             - ``True``: Create a sparse forest. Requires algo='NAIVE' or
+               algo='AUTO'
+
+        blocks_per_sm : integer (default=0)
+            (experimental) Indicates how the number of thread blocks to launch
+            for the inference kernel is determined.
+
+            - ``0`` (default): Launches the number of blocks proportional to
+              the number of data rows
+            - ``>= 1``: Attempts to launch blocks_per_sm blocks per SM. This
+              will fail if blocks_per_sm blocks result in more threads than the
+              maximum supported number of threads per GPU. Even if successful,
+              it is not guaranteed that blocks_per_sm blocks will run on an SM
+              concurrently.
+        compute_shape_str : boolean (default=False)
+            if True or equivalent, creates a ForestInference.shape_str
+            (writes a human-readable forest shape description as a
+            multiline ascii string)
 
         Returns
         ----------
@@ -665,7 +706,6 @@ class ForestInference(Base,
         cuml_fm.load_from_treelite_model(model=tl_model, **kwargs)
         return cuml_fm
 
-    @common_load_params_docstring
     @staticmethod
     def load(filename,
              output_class=False,
@@ -688,7 +728,48 @@ class ForestInference(Base,
             Path to saved model file in a treelite-compatible format
             (See https://treelite.readthedocs.io/en/latest/treelite-api.html
             for more information)
-    {}
+        output_class: boolean (default=False)
+            For a Classification model `output_class` must be True.
+            For a Regression model `output_class` must be False.
+        algo : string (default='auto')
+            Name of the algo (from algo_t enum):
+
+             - ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
+               Currently 'BATCH_TREE_REORG' is used for dense storage,
+               and 'NAIVE' for sparse storage
+             - ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
+             - ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
+               rearranged to be more coalescing-friendly
+             - ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
+               TREE_REORG but predicting multiple rows per thread block
+
+        threshold : float (default=0.5)
+            Threshold is used for classification. It is applied
+            only if ``output_class == True``, else it is ignored.
+        storage_type : string or boolean (default='auto')
+            In-memory storage format to be used for the FIL model:
+
+             - ``'auto'``: Choose the storage type automatically
+               (currently DENSE is always used)
+             - ``False``: Create a dense forest
+             - ``True``: Create a sparse forest. Requires algo='NAIVE' or
+               algo='AUTO'
+
+        blocks_per_sm : integer (default=0)
+            (experimental) Indicates how the number of thread blocks to launch
+            for the inference kernel is determined.
+
+            - ``0`` (default): Launches the number of blocks proportional to
+              the number of data rows
+            - ``>= 1``: Attempts to launch blocks_per_sm blocks per SM. This
+              will fail if blocks_per_sm blocks result in more threads than the
+              maximum supported number of threads per GPU. Even if successful,
+              it is not guaranteed that blocks_per_sm blocks will run on an SM
+              concurrently.
+        compute_shape_str : boolean (default=False)
+            if True or equivalent, creates a ForestInference.shape_str
+            (writes a human-readable forest shape description as a
+            multiline ascii string)
         model_type : string (default="xgboost")
             Format of the saved treelite model to be load.
             It can be 'xgboost', 'xgboost_json', 'lightgbm'.

From db90edbe0b32f3c59bf3f091228af1aa3c211886 Mon Sep 17 00:00:00 2001
From: Levs Dolgovs <ldolgovs@nvidia.com>
Date: Wed, 30 Jun 2021 00:54:14 -0700
Subject: [PATCH 13/13] X docstring is now generated almost like auto-generated
 docstrings

---
 python/cuml/fil/fil.pyx | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index ff804d296c..1b8067dedf 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -37,6 +37,7 @@ from cuml.common.base import Base
 from cuml.raft.common.handle cimport handle_t
 from cuml.common import input_to_cuml_array, logger
 from cuml.common.mixins import CMajorInputTagMixin
+from cuml.common.doc_utils import _parameters_docstrings
 
 import treelite
 import treelite.sklearn as tl_skl
@@ -528,13 +529,11 @@ class ForestInference(Base,
         return func
 
     def common_predict_params_docstring(func):
-        func.__doc__ = getdoc(func).format("""
-    X : array-like (device or host) shape = (n_samples, n_features)
-       Dense matrix (floats) of shape (n_samples, n_features).
-       Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
-       ndarray, cuda array interface compliant array like CuPy
-       For optimal performance, pass a device array with C-style layout
-    """)
+        func.__doc__ = getdoc(func).format(
+          _parameters_docstrings['dense'].format(
+            name='X', shape='(n_samples, n_features)') +
+          '\n    For optimal performance, pass a float device array '
+          'with C-style layout')
         return func
 
     def __init__(self, *,