
Improve FIL code readability and documentation #3056

Merged — 19 commits, Jun 30, 2021
Changes from 12 commits
93774eb
hide inheritance structure and clean up comments
levsnv Oct 23, 2020
4423872
Merge remote-tracking branch 'rapidsai/branch-0.17' into neater-code
levsnv Nov 12, 2020
73a43db
clearer __syncthreads() purpose
levsnv Nov 12, 2020
da8529f
simplify treelite.Model handle extraction
levsnv Nov 14, 2020
1e00e64
removed deprecated "silent" from xgboost params; repeated code
levsnv Nov 14, 2020
9e7463c
moved all_set closer to where it gets affected
levsnv Nov 14, 2020
aa002fc
fixed old bug
levsnv Nov 14, 2020
1db94e5
Revert "simplify treelite.Model handle extraction"
levsnv Nov 21, 2020
6754cbf
Merge branch 'branch-21.08' into neater-code
levsnv Jun 4, 2021
008fd55
update documentation
levsnv Jun 5, 2021
48d868b
delayed merge error fix
levsnv Jun 5, 2021
7dca8a3
python newlines
levsnv Jun 5, 2021
b8a1d1f
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into neater-code
levsnv Jun 19, 2021
f4fb48e
fixed all but static method docstring modifications
levsnv Jun 22, 2021
7e9fb87
addressed review comments; worked around static method patching
levsnv Jun 23, 2021
192aafb
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into neater-code
levsnv Jun 24, 2021
1d16b11
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into neater-code
levsnv Jun 29, 2021
db90edb
X docstring is now generated almost like auto-generated docstrings
levsnv Jun 30, 2021
b1c8e8c
Merge branch 'branch-21.08' of github.com:rapidsai/cuml into neater-code
levsnv Jun 30, 2021
1 change: 1 addition & 0 deletions cpp/src/fil/fil.cu
@@ -429,6 +429,7 @@ void check_params(const forest_params_t* params, bool dense) {
"leaf_algo must be FLOAT_UNARY_BINARY, CATEGORICAL_LEAF"
" or GROVE_PER_CLASS");
}
// output_t::RAW == 0, and doesn't have a separate flag
if ((params->output & ~output_t::ALL_SET) != 0) {
ASSERT(
false,
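The `(params->output & ~output_t::ALL_SET) != 0` check above rejects any flag bit outside the known set; as the new comment notes, `output_t::RAW` is zero and therefore needs no bit of its own. A minimal sketch of that bitmask-validation pattern (hypothetical flag names, not FIL's actual enum values):

```python
# Hypothetical mirror of FIL's output_t flags: RAW is 0, so it has no bit.
RAW, AVG, SIGMOID, CLASS = 0, 1, 2, 4
ALL_SET = AVG | SIGMOID | CLASS  # union of every real flag bit

def check_output(flags):
    # Reject any bit outside the known set (mirrors the C++ ASSERT):
    # masking with ~ALL_SET leaves only unknown bits.
    return (flags & ~ALL_SET) == 0

assert check_output(RAW)            # 0 has no stray bits
assert check_output(AVG | SIGMOID)  # known combination passes
assert not check_output(8)          # unknown flag bit -> invalid
```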
6 changes: 4 additions & 2 deletions cpp/src/fil/infer.cu
@@ -164,7 +164,9 @@ __device__ __forceinline__ vec<1, output_type> infer_one_tree(
curr = n.left(curr) + cond;
}
vec<1, output_type> out;
out[0] = tree[curr].base_node::output<output_type>();
/** Dependent names are not considered templates by default,
    unless they are members of the current [template] instantiation. **/
out[0] = tree[curr].template output<output_type>();
return out;
}

@@ -499,7 +501,7 @@ struct tree_aggregator_t<NITEMS, CATEGORICAL_LEAF> {
// or class probabilities or regression
__device__ __forceinline__ void finalize_class_label(float* out,
int num_rows) {
__syncthreads();
__syncthreads(); // make sure all votes[] are final
int item = threadIdx.x;
int row = item;
if (item < NITEMS && row < num_rows) {
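The clarified comment on `__syncthreads()` says it guarantees every `votes[]` entry is final before any thread reads them. A hedged host-side analogy of that barrier pattern, using Python's `threading.Barrier` (an illustration of the synchronization idea only, not CUDA semantics):

```python
import threading

NITEMS = 4
votes = [0] * NITEMS            # shared array, one slot per "thread"
barrier = threading.Barrier(NITEMS)
results = [None] * NITEMS

def worker(tid):
    votes[tid] = tid + 1        # each thread writes its own vote
    barrier.wait()              # like __syncthreads(): all votes[] are final
    results[tid] = sum(votes)   # now safe to read every other thread's vote

threads = [threading.Thread(target=worker, args=(i,)) for i in range(NITEMS)]
for t in threads:
    t.start()
for t in threads:
    t.join()

assert results == [10, 10, 10, 10]  # every thread saw all four votes
```

Without the barrier, a fast thread could sum `votes` while slower threads were still writing, which is exactly the race the CUDA comment guards against.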
8 changes: 5 additions & 3 deletions python/cuml/fil/README.md
@@ -33,17 +33,18 @@ Additionally, FIL can be called directly from C or C++ code. See [the API docs h
# Features

* Input model source: XGBoost (binary format), cuML RandomForest, scikit-learn RandomForest, LightGBM
* Model types: Regression, Binary Classification, Multi-class Classification (for cuML Random Forests, but not GBDTs or scikit-learn Random Forests)
* Model types: Regression, Binary Classification, Multi-class Classification (for cuML Random Forests or GBDTs, but not scikit-learn Random Forests)
* Tree storage types: Dense or sparse tree storage (see Sparse Forests with FIL blog below)
* Input formats: Dense, row-major, FP32 arrays on GPU or CPU (e.g. NumPy, cuPy, or other data formats supported by cuML). Trees are expected to be trained for float32 inputs. There may be rounding differences if trees were trained for float64 inputs.
* High performance batch inference
* Input parsing based on [Treelite](https://github.com/dmlc/treelite)

Upcoming features:

* Support for multi-class GBDTs is planned for RAPIDS 0.16
* Support for multi-class random forests from scikit-learn
* Support for smaller node storage (8-byte) to reduce memory usage for
small trees is experimental
* Categorical features for LightGBM models

# Benchmarks and performance notes

@@ -74,5 +75,6 @@ GPU, using FIL 0.9.)
* [RAPIDS Forest Inference Library: Prediction at 100 million rows per second](https://medium.com/rapids-ai/rapids-forest-inference-library-prediction-at-100-million-rows-per-second-19558890bc35)
* [Sparse Forests with FIL](https://medium.com/rapids-ai/sparse-forests-with-fil-ffbb42b0c7e3)
* [GBM Inferencing on GPU (earlier research work)](https://on-demand.gputechconf.com/gtc/2018/presentation/s8873-gbm-inferencing-on-gpu-v2.pdf)
* [GBM Inferencing on GPU, 2018 talk (earlier research work)](https://on-demand.gputechconf.com/gtc/2018/presentation/s8873-gbm-inferencing-on-gpu-v2.pdf)
* [Sample Notebook](https://github.com/rapidsai/cuml/blob/branch-0.16/notebooks/forest_inference_demo.ipynb)
* [GTC 2021 talk](https://www.nvidia.com/en-us/on-demand/session/gtcspring21-s31296/)
214 changes: 60 additions & 154 deletions python/cuml/fil/fil.pyx
@@ -401,7 +401,7 @@ class ForestInference(Base,

**Known limitations**:
* A single row of data should fit into the shared memory of a thread
block, which means that more than 12288 features are not supported.
block, which means that rows with more than 5000-12288 features
(depending on the GPU) will be inferred from L1
* From sklearn.ensemble, only
{RandomForest,GradientBoosting,ExtraTrees}{Classifier,Regressor} models
are supported. Other sklearn.ensemble models are currently not
@@ -466,6 +466,59 @@ class ForestInference(Base,

"""

common_load_params_docstring = """
output_class: boolean (default=False)
For a Classification model `output_class` must be True.
For a Regression model `output_class` must be False.
algo : string (default='auto')
Name of the algo (from the algo_t enum):

- ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
Currently 'BATCH_TREE_REORG' is used for dense storage,
and 'NAIVE' for sparse storage
- ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
- ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
rearranged to be more coalescing-friendly
- ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
TREE_REORG but predicting multiple rows per thread block

threshold : float (default=0.5)
Threshold is used for classification. It is applied
only if ``output_class == True``, else it is ignored.
storage_type : string or boolean (default='auto')
In-memory storage format to be used for the FIL model:

- ``'auto'``: Choose the storage type automatically
(currently DENSE is always used)
- ``False``: Create a dense forest
- ``True``: Create a sparse forest. Requires algo='NAIVE' or
algo='AUTO'

blocks_per_sm : integer (default=0)
(experimental) Indicates how the number of thread blocks to launch
for the inference kernel is determined.

- ``0`` (default): Launches the number of blocks proportional to
the number of data rows
- ``>= 1``: Attempts to launch blocks_per_sm blocks per SM. This
will fail if blocks_per_sm blocks result in more threads than the
maximum supported number of threads per GPU. Even if successful,
it is not guaranteed that blocks_per_sm blocks will run on an SM
concurrently.
compute_shape_str : boolean (default=False)
If True or equivalent, creates a ForestInference.shape_str attribute
(a human-readable forest shape description as a multiline ASCII
string)
"""

common_predict_params_docstring = """
X : array-like (device or host) shape = (n_samples, n_features)
Dense matrix (floats) of shape (n_samples, n_features).
Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
ndarray, or a CUDA array interface compliant array such as CuPy.
For optimal performance, pass a device array with C-style layout.
"""

def __init__(self, *,
handle=None,
output_type=None,
@@ -486,11 +539,7 @@ class ForestInference(Base,

Parameters
----------
X : array-like (device or host) shape = (n_samples, n_features)
Dense matrix (floats) of shape (n_samples, n_features).
Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
ndarray, cuda array interface compliant array like CuPy
For optimal performance, pass a device array with C-style layout
""" + ForestInference.common_predict_params_docstring + """
preds: gpuarray or cudf.Series, shape = (n_samples,)
Optional 'out' location to store inference results

@@ -509,11 +558,7 @@ class ForestInference(Base,

Parameters
----------
X : array-like (device or host) shape = (n_samples, n_features)
Dense matrix (floats) of shape (n_samples, n_features).
Acceptable formats: cuDF DataFrame, NumPy ndarray, Numba device
ndarray, cuda array interface compliant array like CuPy
For optimal performance, pass a device array with C-style layout
""" + ForestInference.common_predict_params_docstring + """
preds: gpuarray or cudf.Series, shape = (n_samples,2)
Binary probability output
Optional 'out' location to store inference results
@@ -541,53 +586,7 @@ class ForestInference(Base,
the trained model information in the treelite format
loaded from a saved model using the treelite API
https://treelite.readthedocs.io/en/latest/treelite-api.html
output_class: boolean (default=False)
For a Classification model `output_class` must be True.
For a Regression model `output_class` must be False.
algo : string (default='auto')
Name of the algo from (from algo_t enum):

- ``'AUTO'`` or ``'auto'``: choose the algorithm automatically.
Currently 'BATCH_TREE_REORG' is used for dense storage,
and 'NAIVE' for sparse storage
- ``'NAIVE'`` or ``'naive'``: simple inference using shared memory
- ``'TREE_REORG'`` or ``'tree_reorg'``: similar to naive but trees
rearranged to be more coalescing-friendly
- ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: similar to
TREE_REORG but predicting multiple rows per thread block

threshold : float (default=0.5)
Threshold is used to for classification. It is applied
only if ``output_class == True``, else it is ignored.
storage_type : string or boolean (default='auto')
In-memory storage format to be used for the FIL model:

- ``'auto'``: Choose the storage type automatically
(currently DENSE is always used)
- ``False``: Create a dense forest
- ``True``: Create a sparse forest. Requires algo='NAIVE' or
algo='AUTO'
- ``'sparse8'``: (experimental) Create a sparse forest with 8-byte
nodes. Requires algo='NAIVE' or algo='AUTO'. Can fail if 8-byte
nodes are not enough to store the forest, e.g. if there are too
many nodes in a tree or too many features

blocks_per_sm : integer (default=0)
(experimental) Indicates how the number of thread blocks to lauch
for the inference kernel is determined.

- ``0`` (default): Launches the number of blocks proportional to
the number of data rows
- ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
will fail if blocks_per_sm blocks result in more threads than the
maximum supported number of threads per GPU. Even if successful,
it is not guaranteed that blocks_per_sm blocks will run on an SM
concurrently.
compute_shape_str : boolean (default=False)
if True or equivalent, creates a ForestInference.shape_str
(writes a human-readable forest shape description as a
multiline ascii string)

""" + ForestInference.common_load_params_docstring + """
Returns
----------
fil_model
@@ -622,48 +621,7 @@ class ForestInference(Base,
----------
skl_model
The scikit-learn model from which to build the FIL version.
output_class: boolean (default=False)
For a Classification model `output_class` must be True.
For a Regression model `output_class` must be False.
algo : string (default='auto')
Name of the algo from (from algo_t enum):

- ``'AUTO'`` or ``'auto'``: Choose the algorithm automatically.
Currently 'BATCH_TREE_REORG' is used for dense storage,
and 'NAIVE' for sparse storage
- ``'NAIVE'`` or ``'naive'``: Simple inference using shared memory
- ``'TREE_REORG'`` or ``'tree_reorg'``: Similar to naive but trees
rearranged to be more coalescing-friendly
- ``'BATCH_TREE_REORG'`` or ``'batch_tree_reorg'``: Similar to
TREE_REORG but predicting multiple rows per thread block

threshold : float (default=0.5)
Threshold is used to for classification. It is applied
only if ``output_class == True``, else it is ignored.
storage_type : string or boolean (default='auto')
In-memory storage format to be used for the FIL model:

- ``'auto'``: Choose the storage type automatically
(currently DENSE is always used)
- ``False``: Create a dense forest
- ``True``: Create a sparse forest. Requires algo='NAIVE' or
algo='AUTO'

blocks_per_sm : integer (default=0)
(experimental) Indicates how the number of thread blocks to lauch
for the inference kernel is determined.

- ``0`` (default): Launches the number of blocks proportional to
the number of data rows
- ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
will fail if blocks_per_sm blocks result in more threads than the
maximum supported number of threads per GPU. Even if successful,
it is not guaranteed that blocks_per_sm blocks will run on an SM
concurrently.
compute_shape_str : boolean (default=False)
if True or equivalent, creates a ForestInference.shape_str
(writes a human-readable forest shape description as a
multiline ascii string)
""" + ForestInference.common_load_params_docstring + """

Returns
----------
@@ -702,33 +660,7 @@ class ForestInference(Base,
Path to saved model file in a treelite-compatible format
(See https://treelite.readthedocs.io/en/latest/treelite-api.html
for more information)
output_class: boolean (default=False)
For a Classification model `output_class` must be True.
For a Regression model `output_class` must be False.
threshold : float (default=0.5)
Cutoff value above which a prediction is set to 1.0
Only used if the model is classification and `output_class` is True
algo : string (default='auto')
Which inference algorithm to use.
See documentation in `FIL.load_from_treelite_model`
storage_type : string (default='auto')
In-memory storage format to be used for the FIL model.
See documentation in `FIL.load_from_treelite_model`
blocks_per_sm : integer (default=0)
(experimental) Indicates how the number of thread blocks to lauch
for the inference kernel is determined.

- ``0`` (default): Launches the number of blocks proportional to
the number of data rows
- ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
will fail if blocks_per_sm blocks result in more threads than the
maximum supported number of threads per GPU. Even if successful,
it is not guaranteed that blocks_per_sm blocks will run on an SM
concurrently.
compute_shape_str : boolean (default=False)
if True or equivalent, creates a ForestInference.shape_str
(writes a human-readable forest shape description as a
multiline ascii string)
""" + ForestInference.common_load_params_docstring + """
model_type : string (default="xgboost")
Format of the saved treelite model to be load.
It can be 'xgboost', 'xgboost_json', 'lightgbm'.
@@ -765,33 +697,7 @@ class ForestInference(Base,
model_handle : Modelhandle to the treelite forest model
(See https://treelite.readthedocs.io/en/latest/treelite-api.html
for more information)
output_class: boolean (default=False)
For a Classification model `output_class` must be True.
For a Regression model `output_class` must be False.
threshold : float (default=0.5)
Cutoff value above which a prediction is set to 1.0
Only used if the model is classification and `output_class` is True
algo : string (default='auto')
Which inference algorithm to use.
See documentation in `FIL.load_from_treelite_model`
storage_type : string (default='auto')
In-memory storage format to be used for the FIL model.
See documentation in `FIL.load_from_treelite_model`
blocks_per_sm : integer (default=0)
(experimental) Indicates how the number of thread blocks to lauch
for the inference kernel is determined.

- ``0`` (default): Launches the number of blocks proportional to
the number of data rows
- ``>= 1``: Attempts to lauch blocks_per_sm blocks per SM. This
will fail if blocks_per_sm blocks result in more threads than the
maximum supported number of threads per GPU. Even if successful,
it is not guaranteed that blocks_per_sm blocks will run on an SM
concurrently.
compute_shape_str : boolean (default=False)
if True or equivalent, creates a ForestInference.shape_str
(writes a human-readable forest shape description as a
multiline ascii string)
""" + ForestInference.common_load_params_docstring + """

Returns
----------
5 changes: 1 addition & 4 deletions python/cuml/test/test_fil.py
@@ -70,22 +70,19 @@ def _build_and_save_xgboost(model_path,
dtrain = xgb.DMatrix(X_train, label=y_train)

# instantiate params
params = {'silent': 1}
params = {'eval_metric': 'error', 'max_depth': 25}

# learning task params
if classification:
params['eval_metric'] = 'error'
if n_classes == 2:
params['objective'] = 'binary:logistic'
else:
params['num_class'] = n_classes
params['objective'] = 'multi:softprob'
else:
params['eval_metric'] = 'error'
params['objective'] = 'reg:squarederror'
params['base_score'] = 0.0

params['max_depth'] = 25
params.update(xgboost_params)
bst = xgb.train(params, dtrain, num_rounds)
bst.save_model(model_path)
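The parameter-building logic the diff consolidates above (base params, classification vs. regression branches, then `params.update(xgboost_params)` so caller overrides win) can be sketched as a standalone function — `make_xgb_params` is an illustrative name, not part of any API:

```python
def make_xgb_params(classification, n_classes, extra=None):
    # Pure-Python sketch of the parameter logic in _build_and_save_xgboost.
    params = {'eval_metric': 'error', 'max_depth': 25}
    if classification:
        if n_classes == 2:
            params['objective'] = 'binary:logistic'
        else:
            params['num_class'] = n_classes
            params['objective'] = 'multi:softprob'
    else:
        params['objective'] = 'reg:squarederror'
        params['base_score'] = 0.0
    params.update(extra or {})   # caller-supplied params override defaults
    return params

assert make_xgb_params(True, 2)['objective'] == 'binary:logistic'
assert make_xgb_params(True, 5)['num_class'] == 5
assert make_xgb_params(False, 2)['objective'] == 'reg:squarederror'
assert make_xgb_params(True, 2, {'max_depth': 8})['max_depth'] == 8
```

Note that `dict.update` applied last is what lets `xgboost_params` override defaults such as `max_depth` — the same effect the removed trailing `params['max_depth'] = 25` line used to undercut.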