From 0b4ccb5230d538889666086cc544f6d0f4cae961 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Mon, 11 May 2020 10:40:45 -0500
Subject: [PATCH 01/32] update cython code

---
 python/cuml/ensemble/randomforest_common.pyx  | 400 ++++++++++++++++++
 python/cuml/ensemble/randomforest_shared.pxd  |   3 +
 .../cuml/ensemble/randomforestclassifier.pyx  | 373 +++++-----------
 .../cuml/ensemble/randomforestregressor.pyx   | 300 +++----------
 4 files changed, 566 insertions(+), 510 deletions(-)
 create mode 100644 python/cuml/ensemble/randomforest_common.pyx

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
new file mode 100644
index 0000000000..cb180402c0
--- /dev/null
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -0,0 +1,400 @@
+#
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import cupy as cp
+from cuml import ForestInference
+from cuml.fil.fil import TreeliteModel as tl
+from cuml.common.handle import Handle
+from cuml.common.base import Base
+
+from cuml.ensemble.randomforest_shared cimport *
+from cuml.utils import input_to_cuml_array, rmm_cupy_ary
+
+cimport cython
+
+
+class BaseRandomForestModel(Base):
+    variables = ['n_estimators', 'max_depth', 'handle',
+                 'max_features', 'n_bins',
+                 'split_algo', 'split_criterion', 'min_rows_per_node',
+                 'min_impurity_decrease',
+                 'bootstrap', 'bootstrap_features',
+                 'verbose', 'rows_sample',
+                 'max_leaves', 'quantile_per_tree']
+
+    def _create_model(self, model, seed, split_criterion,
+                      n_streams, n_estimators=100,
+                      max_depth=16, handle=None, max_features='auto',
+                      n_bins=8, split_algo=1, bootstrap=True,
+                      bootstrap_features=False,
+                      verbose=False, min_rows_per_node=2,
+                      rows_sample=1.0, max_leaves=-1,
+                      accuracy_metric=None, dtype=None,
+                      output_type=None, min_samples_leaf=None,
+                      min_weight_fraction_leaf=None, n_jobs=None,
+                      max_leaf_nodes=None, min_impurity_decrease=0.0,
+                      min_impurity_split=None, oob_score=None,
+                      random_state=None, warm_start=None, class_weight=None,
+                      quantile_per_tree=False, criterion=None):
+        """Validate the constructor arguments and store them on `self`.
+
+        Raises TypeError if a Scikit-learn-only argument is not None and
+        ValueError if `max_depth` is negative.
+        """
+        # Class-level list shared by all instances: register the name once.
+        if accuracy_metric and 'accuracy_metric' not in model.variables:
+            model.variables.append('accuracy_metric')
+        sklearn_params = {"criterion": criterion,
+                          "min_samples_leaf": min_samples_leaf,
+                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
+                          "max_leaf_nodes": max_leaf_nodes,
+                          "min_impurity_split": min_impurity_split,
+                          "oob_score": oob_score, "n_jobs": n_jobs,
+                          "random_state": random_state,
+                          "warm_start": warm_start,
+                          "class_weight": class_weight}
+        for key, vals in sklearn_params.items():
+            if vals is not None:
+                raise TypeError("The Scikit-learn variable {} is not "
+                                "supported in cuML, please read the cuML "
+                                "documentation for more information"
+                                .format(key))
+
+        if handle is None:
+            handle = Handle(n_streams)
+        super(model, self).__init__(handle=handle, verbose=verbose,
+                                    output_type=output_type)
+        if max_depth < 0:
+            raise ValueError("Must specify max_depth >0 ")
+
+        self.split_algo = split_algo
+        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
+                          '3': MAE, '4': CRITERION_END}
+        if str(split_criterion) not in criterion_dict.keys():
+            warnings.warn("The split criterion chosen was not present"
+                          " in the list of options accepted by the model"
+                          " and so the CRITERION_END option has been chosen.")
+            self.split_criterion = CRITERION_END
+        else:
+            self.split_criterion = criterion_dict[str(split_criterion)]
+
+        self.min_rows_per_node = min_rows_per_node
+        self.min_impurity_decrease = min_impurity_decrease
+        self.bootstrap_features = bootstrap_features
+        self.rows_sample = rows_sample
+        self.max_leaves = max_leaves
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.max_features = max_features
+        self.bootstrap = bootstrap
+        self.verbose = verbose
+        self.n_bins = n_bins
+        self.n_cols = None
+        self.dtype = dtype
+        self.accuracy_metric = accuracy_metric
+        self.quantile_per_tree = quantile_per_tree
+        self.n_streams = handle.getNumInternalStreams()
+        self.seed = seed
+        self.model_pbuf_bytes = []
+    """
+    def _check_rf_meta_data_format(self, task_category):
+      if task_category == CLASSIFICATION
+    """
+    def _get_max_feat_val(self):
+        """Return the feature-sampling value derived from `max_features`."""
+        max_feat = self.max_features
+        if type(max_feat) == int:
+            return max_feat/self.n_cols
+        if type(max_feat) == float:
+            return max_feat
+        if max_feat == 'log2':
+            return math.log2(self.n_cols)/self.n_cols
+        # 'auto' behaves like 'sqrt' for classifiers and keeps every
+        # column for any other `RF_type`.
+        if max_feat == 'auto' and self.RF_type != CLASSIFICATION:
+            return 1.0
+        if max_feat == 'sqrt' or max_feat == 'auto':
+            return 1/np.sqrt(self.n_cols)
+        raise ValueError("Wrong value passed in for max_features"
+                         " please read the documentation")
+
+    def check_rf_metadata_type(self):
+        """
+        Allocate float and double `RandomForestMetaData` objects for
+        `RF_type`; store their addresses in `rf_forest`/`rf_forest64`.
+        """
+        cdef RandomForestMetaData[float, int] *class_meta32
+        cdef RandomForestMetaData[double, int] *class_meta64
+        cdef RandomForestMetaData[float, float] *reg_meta32
+        cdef RandomForestMetaData[double, double] *reg_meta64
+        if self.RF_type != CLASSIFICATION:
+            reg_meta32 = new RandomForestMetaData[float, float]()
+            self.rf_forest = <uintptr_t> reg_meta32
+            reg_meta64 = new RandomForestMetaData[double, double]()
+            self.rf_forest64 = <uintptr_t> reg_meta64
+        else:
+            class_meta32 = new RandomForestMetaData[float, int]()
+            self.rf_forest = <uintptr_t> class_meta32
+            class_meta64 = new RandomForestMetaData[double, int]()
+            self.rf_forest64 = <uintptr_t> class_meta64
+
+    def fit_setup(self, X, y, convert_dtype):
+        """
+        Validate and convert the training data ahead of a `fit` call.
+
+        Forest data left over from an earlier fit is reset first, then
+        `X` and `y` are converted with `input_to_cuml_array`.
+
+        Parameters
+        ----------
+        X : array-like
+            Training samples; only float32 and float64 data pass the
+            dtype check. Converted with column-major ('F') order.
+        y : array-like
+            Targets with one column and as many rows as `X`. For
+            classification they must be int32 labels that include every
+            value from 0 to the number of unique labels minus one.
+        convert_dtype : bool
+            When True, `y` is converted to int32 for classification and
+            to the dtype of `X` otherwise. When False no conversion is
+            requested.
+
+        Returns
+        ----------
+        X_m, y_m :
+            The converted `X` and `y` from `input_to_cuml_array`.
+        max_feature_val :
+            The value computed by `_get_max_feat_val`.
+
+        Raises
+        ----------
+        TypeError
+            If the classification labels are not int32 after conversion.
+        ValueError
+            If the classification labels are not consecutive from 0.
+
+        Notes
+        ----------
+        Sets `n_rows`, `n_cols` and `dtype` from `X`, sets `num_classes`
+        for classification, and replaces a float `min_rows_per_node`
+        with `ceil(min_rows_per_node * n_rows)`. A warning is issued
+        when `X` is float64.
+        """
+        self._set_output_type(X)
+
+        # Reset the old tree data for new fit call
+        self._reset_forest_data()
+
+        X_m, self.n_rows, self.n_cols, self.dtype = \
+            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
+                                order='F')
+        if self.RF_type == CLASSIFICATION:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y, check_dtype=np.int32,
+                                    convert_to_dtype=(np.int32 if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+            if y_dtype != np.int32:
+                raise TypeError("The labels `y` need to be of dtype `np.int32`")
+            unique_labels = rmm_cupy_ary(cp.unique, y_m)
+            self.num_classes = len(unique_labels)
+            for i in range(self.num_classes):
+                if i not in unique_labels:
+                    raise ValueError("The labels need "
+                                     "to be consecutive values from "
+                                     "0 to the number of unique label values")
+        else:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y,
+                                    convert_to_dtype=(self.dtype if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+
+        if self.dtype == np.float64:
+            warnings.warn("To use GPU-based prediction, first train using "
+                          "float 32 data to fit the estimator.")
+
+        max_feature_val = self._get_max_feat_val()
+        # A float is a fraction of the rows; turn it into a row count.
+        if type(self.min_rows_per_node) == float:
+            self.min_rows_per_node = \
+                math.ceil(self.min_rows_per_node*self.n_rows)
+
+        return X_m, y_m, max_feature_val
+
+    def _predict_model_on_gpu(self, model, X, algo, convert_dtype,
+                              fil_sparse_format, threshold=0.5,
+                              output_class=False, predict_proba=False):
+        """
+        Predict on `X` with a FIL model built from `self.treelite_handle`.
+
+        Raises TypeError for float64 `X` unless `convert_dtype` is True.
+        The handle is freed and cleared even when prediction fails.
+        """
+        out_type = self._get_output_type(X)
+        # Called for its column-count check and for the dtype of `X`.
+        _, _, _, dtype = input_to_cuml_array(X, order='F',
+                                             check_cols=self.n_cols)
+        if dtype == np.float64 and not convert_dtype:
+            raise TypeError("GPU based predict only accepts np.float32 "
+                            "data. Please set convert_dtype=True to "
+                            "convert the test data to the same dtype as "
+                            "the data used to train, ie. np.float32. If "
+                            "you would like to use test data of "
+                            "dtype=np.float64 please set "
+                            "predict_model='CPU' to use the CPU "
+                            "implementation of predict.")
+        model._obtain_treelite_handle()
+        try:
+            storage_type = _check_fil_parameter_validity(
+                depth=self.max_depth, algo=algo,
+                fil_sparse_format=fil_sparse_format)
+            fil_model = ForestInference().load_from_randomforest(
+                self.treelite_handle, output_class=output_class,
+                threshold=threshold, algo=algo, storage_type=storage_type)
+            return fil_model.predict(X, output_type=out_type,
+                                     predict_proba=predict_proba)
+        finally:
+            tl.free_treelite_model(self.treelite_handle)
+            # Drop the stale pointer so it is never reused after the free.
+            self.treelite_handle = None
+
+    def _get_params(self, model, deep):
+        """Map each name in `model.variables` (bar 'handle') to its value.
+
+        Attributes missing on `self` are reported as None; `deep` is unused.
+        """
+        return {name: getattr(self, name, None)
+                for name in model.variables
+                if name != 'handle'}
+
+    def _set_params(self, model, **params):
+        """Reset handle/serialized state, then assign `params` onto self.
+
+        Raises ValueError at the first key absent from `model.variables`.
+        """
+        self.handle.__setstate__(self.n_streams)
+        self.model_pbuf_bytes = []
+        for name in params:
+            if name not in model.variables:
+                raise ValueError('Invalid parameter for estimator')
+            setattr(self, name, params[name])
+        return self
+
+    """
+    def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type):
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef rf_class_float *rf_forest_class
+        cdef rf_reg_float *rf_forest_reg
+        if task_category == CLASSIFICATION:
+            rf_forest_class = \
+                <rf_meta_type*><uintptr_t> self.rf_forest
+
+        else:
+            rf_forest_reg = \
+                <rf_meta_type*><uintptr_t> self.rf_forest
+        build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr,
+                                  rf_forest_reg,
+                                  <int> self.n_cols,
+                                  <int> task_category,
+                                  <vector[unsigned char] &> self.model_pbuf_bytes)
+        mod_ptr = <size_t> cuml_model_ptr
+        treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return treelite_handle
+
+    """
+    def _get_protobuf_bytes_common(self, model):
+        """Return `save_model` output for the treelite handle of `model`."""
+        cdef uintptr_t handle_addr = \
+            <uintptr_t> model._obtain_treelite_handle()
+        return save_model(<ModelHandle> handle_addr)
+
+
+def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
+    """Return the FIL storage format; reject unsafe deep-forest options."""
+    storage_format = _check_fil_sparse_format_value(fil_sparse_format)
+    reorg_algo = algo in ('tree_reorg', 'batch_tree_reorg')
+    if depth > 16 and (storage_format == 'dense' or reorg_algo):
+        raise ValueError("While creating a forest with max_depth greater "
+                         "than 16, `fil_sparse_format` should be True. "
+                         "If `fil_sparse_format=False` then the memory "
+                         "consumed while creating the FIL forest is very "
+                         "large and the process will be aborted. In "
+                         "addition, `algo` must be either set to `naive` "
+                         "or `auto` to set `fil_sparse_format=True`.")
+    return storage_format
+
+
+def _check_fil_sparse_format_value(fil_sparse_format):
+    """Translate `fil_sparse_format` into a FIL storage type string.
+
+    'auto' passes through, falsy -> 'dense', True -> 'sparse', else raise.
+    """
+    if fil_sparse_format == 'auto':
+        return fil_sparse_format
+    if not fil_sparse_format:
+        return 'dense'
+    if fil_sparse_format not in (True, False, 'auto'):
+        raise ValueError("The value entered for fil_sparse_format is not "
+                         "supported. Please refer to the documentation "
+                         "to see the accepted values.")
+    return 'sparse'
+
+
+def _obtain_treelite_model(treelite_handle):
+    """
+    Wrap the treelite handle of a cuML Random Forest model in a
+    Treelite model object.
+
+    Returns
+    ----------
+    treelite_model :
+        The result of `TreeliteModel.from_treelite_model_handle` for
+        `treelite_handle`.
+    """
+    return tl.from_treelite_model_handle(treelite_handle)
+
+
+def _obtain_fil_model(treelite_handle, depth,
+                      output_class=True,
+                      threshold=0.5, algo='auto',
+                      fil_sparse_format='auto'):
+    """
+    Build a Forest Inference (FIL) model from the treelite handle of a
+    cuML Random Forest model.
+
+    `depth`, `algo` and `fil_sparse_format` are checked by
+    `_check_fil_parameter_validity`, whose result is used as the FIL
+    storage type.
+
+    Returns
+    ----------
+    fil_model :
+        The object returned by `ForestInference.load_from_randomforest`,
+        which can be used to perform inferencing on the random forest
+        model.
+    """
+    # Check the option combination before creating any FIL object.
+    storage_type = _check_fil_parameter_validity(
+        depth=depth, algo=algo, fil_sparse_format=fil_sparse_format)
+
+    forest = ForestInference()
+    return forest.load_from_randomforest(treelite_handle,
+                                         output_class=output_class,
+                                         threshold=threshold,
+                                         algo=algo,
+                                         storage_type=storage_type)
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 4ea76c92b7..10c0657030 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -37,6 +37,8 @@ from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \
     input_to_dev_array, zeros
 cimport cuml.common.handle
 cimport cuml.common.cuda
+cimport cython
+
 
 cdef extern from "treelite/c_api.h":
     ctypedef void* ModelHandle
@@ -127,3 +129,4 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
 
     cdef ModelHandle concatenate_trees(
         vector[ModelHandle] &treelite_handles) except +
+
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 0bfebaf950..aea7768da1 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -1,3 +1,4 @@
+
 #
 # Copyright (c) 2019-2020, NVIDIA CORPORATION.
 #
@@ -38,6 +39,8 @@ from cuml import ForestInference
 from cuml.common.array import CumlArray
 from cuml.common.base import Base
 from cuml.common.handle import Handle
+from cuml.ensemble.randomforest_common import BaseRandomForestModel
+
 from cuml.common.handle cimport cumlHandle
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
     _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
@@ -50,6 +53,7 @@ from numba import cuda
 
 cimport cuml.common.handle
 cimport cuml.common.cuda
+cimport cython
 
 cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
 
@@ -120,7 +124,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                           bool) except +
 
 
-class RandomForestClassifier(Base):
+class RandomForestClassifier(BaseRandomForestModel):
     """
     Implements a Random Forest classifier model which fits multiple decision
     tree classifiers in an ensemble.
@@ -219,88 +223,20 @@ class RandomForestClassifier(Base):
     seed : int (default = None)
         Seed for the random number generator. Unseeded by default.
     """
+    def __init__(self, split_criterion=0, seed=None,
+                 n_streams=8, **kwargs):
 
-    variables = ['n_estimators', 'max_depth', 'handle',
-                 'max_features', 'n_bins',
-                 'split_algo', 'split_criterion', 'min_rows_per_node',
-                 'min_impurity_decrease',
-                 'bootstrap', 'bootstrap_features',
-                 'verbose', 'rows_sample',
-                 'max_leaves', 'quantile_per_tree']
-
-    def __init__(self, n_estimators=100, max_depth=16, handle=None,
-                 max_features='auto', n_bins=8, n_streams=8,
-                 split_algo=1, split_criterion=0, min_rows_per_node=2,
-                 bootstrap=True, bootstrap_features=False,
-                 type_model="classifier", verbose=False,
-                 rows_sample=1.0, max_leaves=-1, quantile_per_tree=False,
-                 output_type=None, criterion=None, dtype=None,
-                 min_samples_leaf=None, min_weight_fraction_leaf=None,
-                 max_leaf_nodes=None, min_impurity_decrease=0.0,
-                 min_impurity_split=None, oob_score=None, n_jobs=None,
-                 random_state=None, warm_start=None, class_weight=None,
-                 seed=None):
-        sklearn_params = {"criterion": criterion,
-                          "min_samples_leaf": min_samples_leaf,
-                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
-                          "max_leaf_nodes": max_leaf_nodes,
-                          "min_impurity_split": min_impurity_split,
-                          "oob_score": oob_score, "n_jobs": n_jobs,
-                          "random_state": random_state,
-                          "warm_start": warm_start,
-                          "class_weight": class_weight}
-
-        for key, vals in sklearn_params.items():
-            if vals is not None:
-                raise TypeError("The Scikit-learn variable", key,
-                                " is not supported in cuML,"
-                                " please read the cuML documentation for"
-                                " more information")
-
-        if max_depth < 0:
-            raise ValueError("Must specify max_depth >0")
-
-        if handle is None:
-            handle = Handle(n_streams)
-
-        super(RandomForestClassifier, self).__init__(handle=handle,
-                                                     verbose=verbose,
-                                                     output_type=output_type)
-
-        self.split_algo = split_algo
-        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                          '3': MAE, '4': CRITERION_END}
-        if str(split_criterion) not in criterion_dict.keys():
-            warnings.warn("The split criterion chosen was not present"
-                          " in the list of options accepted by the model"
-                          " and so the CRITERION_END option has been chosen.")
-            self.split_criterion = CRITERION_END
-        else:
-            self.split_criterion = criterion_dict[str(split_criterion)]
-
-        self.min_rows_per_node = min_rows_per_node
-        self.min_impurity_decrease = min_impurity_decrease
-        self.bootstrap_features = bootstrap_features
-        self.rows_sample = rows_sample
-        self.max_leaves = max_leaves
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.max_features = max_features
-        self.bootstrap = bootstrap
-        self.verbose = verbose
-        self.n_bins = n_bins
-        self.quantile_per_tree = quantile_per_tree
-        self.n_cols = None
-        self.dtype = None
-        self.n_streams = handle.getNumInternalStreams()
-        self.seed = seed
-        self.num_classes = 2
         if ((seed is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results, n_streams==1 is "
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "
                           "random_seed is set")
-        self.model_pbuf_bytes = []
+        self.RF_type = CLASSIFICATION
+        self.num_classes = 2
+        self._create_model(model=RandomForestClassifier,
+                           split_criterion=split_criterion,
+                           seed=seed, n_streams=n_streams,
+                           **kwargs)
 
     """
     TODO:
@@ -319,10 +255,10 @@ class RandomForestClassifier(Base):
         if self.n_cols:
             # only if model has been fit previously
             self.model_pbuf_bytes = self._get_protobuf_bytes()
-            params_t = <size_t> self.rf_forest
+            params_t = <uintptr_t> self.rf_forest
             rf_forest = \
                 <RandomForestMetaData[float, int]*>params_t
-            params_t64 = <size_t> self.rf_forest64
+            params_t64 = <uintptr_t> self.rf_forest64
             rf_forest64 = \
                 <RandomForestMetaData[double, int]*>params_t64
             if self.dtype == np.float32:
@@ -347,10 +283,10 @@ class RandomForestClassifier(Base):
         if self.n_cols:
             if state["dtype"] == np.float32:
                 rf_forest.rf_params = state["rf_params"]
-                state["rf_forest"] = <size_t>rf_forest
+                state["rf_forest"] = <uintptr_t>rf_forest
             else:
                 rf_forest64.rf_params = state["rf_params64"]
-                state["rf_forest64"] = <size_t>rf_forest64
+                state["rf_forest64"] = <uintptr_t>rf_forest64
 
         self.model_pbuf_bytes = state["model_pbuf_bytes"]
         self.__dict__.update(state)
@@ -358,60 +294,74 @@ class RandomForestClassifier(Base):
     def __del__(self):
         if self.n_cols:
             if self.dtype == np.float32:
-                free(<RandomForestMetaData[float, int]*><size_t>
+                free(<RandomForestMetaData[float, int]*><uintptr_t>
                      self.rf_forest)
             else:
-                free(<RandomForestMetaData[double, int]*><size_t>
+                free(<RandomForestMetaData[double, int]*><uintptr_t>
                      self.rf_forest64)
 
     def _reset_forest_data(self):
         # Only if model is fitted before
         # Clears the data of the forest to prepare for next fit
         if self.n_cols:
-            free(<RandomForestMetaData[float, int]*><size_t>
+            free(<RandomForestMetaData[float, int]*><uintptr_t>
                  self.rf_forest)
-            free(<RandomForestMetaData[double, int]*><size_t>
+            free(<RandomForestMetaData[double, int]*><uintptr_t>
                  self.rf_forest64)
 
-    def _get_max_feat_val(self):
-        if type(self.max_features) == int:
-            return self.max_features/self.n_cols
-        elif type(self.max_features) == float:
-            return self.max_features
-        elif self.max_features == 'sqrt' or self.max_features == 'auto':
-            return 1/np.sqrt(self.n_cols)
-        elif self.max_features == 'log2':
-            return math.log2(self.n_cols)/self.n_cols
-        else:
-            raise ValueError("Wrong value passed in for max_features"
-                             " please read the documentation")
-
     def _obtain_treelite_handle(self):
-        task_category = CLASSIFICATION_MODEL
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef RandomForestMetaData[float, int] *rf_forest = \
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
         if self.num_classes > 2:
             raise NotImplementedError("Pickling for multi-class "
                                       "classification models is currently not "
                                       "implemented. Please check cuml issue "
                                       "#1679 for more information.")
-
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return treelite_handle
+        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
+        cdef vector[unsigned char] model_pbuf_vec
+        with cython.boundscheck(False):
+            model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
+        if self.treelite_handle is None:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+            mod_ptr = <uintptr_t> cuml_model_ptr
+            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return self.treelite_handle
 
     def _get_protobuf_bytes(self):
-        fit_mod_ptr = self._obtain_treelite_handle()
+        """
+        Returns the self.model_pbuf_bytes.
+        Cuml RF model gets converted to treelite protobuf bytes by:
+            1. converting the cuml RF model to a treelite model. The treelite
+            models handle (pointer) is returned
+            2. The treelite model handle is used to convert the treelite model
+            to a treelite protobuf model which is stored in a temporary file.
+            The protobuf model information is read from the temporary file and
+            the byte information is returned.
+        The treelite handle is stored `self.treelite_handle` and the treelite
+        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
+        of information is already present in the model then the respective
+        step is skipped.
+        """
+        if self.model_pbuf_bytes:
+            return self.model_pbuf_bytes
+        elif self.treelite_handle:
+            fit_mod_ptr = self.treelite_handle
+        else:
+            fit_mod_ptr = self._obtain_treelite_handle()
         cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        model_protobuf_bytes = save_model(<ModelHandle> model_ptr)
-
-        return model_protobuf_bytes
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self.model_pbuf_bytes
 
     def convert_to_treelite_model(self):
         """
@@ -421,8 +371,8 @@ class RandomForestClassifier(Base):
         ----------
         tl_to_fil_model : Treelite version of this model
         """
-        treelite_handle = self._obtain_treelite_handle()
-        return _obtain_treelite_model(treelite_handle)
+        handle = self._obtain_treelite_handle()
+        return _obtain_treelite_model(handle)
 
     def convert_to_fil_model(self, output_class=True,
                              threshold=0.5, algo='auto',
@@ -470,7 +420,6 @@ class RandomForestClassifier(Base):
             A Forest Inference model which can be used to perform
             inferencing on the random forest model.
         """
-
         treelite_handle = self._obtain_treelite_handle()
         return _obtain_fil_model(treelite_handle=treelite_handle,
                                  depth=self.max_depth,
@@ -479,8 +428,6 @@ class RandomForestClassifier(Base):
                                  algo=algo,
                                  fil_sparse_format=fil_sparse_format)
 
-        return tl_to_fil_model
-
     """
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
@@ -488,14 +435,14 @@ class RandomForestClassifier(Base):
     def _tl_model_handles(self, model_bytes):
         cdef ModelHandle cuml_model_ptr = NULL
         cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
         task_category = CLASSIFICATION_MODEL
         build_treelite_forest(& cuml_model_ptr,
                               rf_forest,
                               <int> self.n_cols,
                               <int> task_category,
                               <vector[unsigned char] &> model_bytes)
-        mod_handle = <size_t> cuml_model_ptr
+        mod_handle = <uintptr_t> cuml_model_ptr
 
         return ctypes.c_void_p(mod_handle).value
 
@@ -510,14 +457,14 @@ class RandomForestClassifier(Base):
                 <ModelHandle> mod_ptr))
 
         concat_model_handle = concatenate_trees(deref(model_handles))
-
-        concat_model_ptr = <size_t> concat_model_handle
-        return ctypes.c_void_p(concat_model_ptr).value
-
-    def _concatenate_model_bytes(self, concat_model_handle):
-        cdef uintptr_t model_ptr = <uintptr_t> concat_model_handle
-        concat_model_bytes = save_model(<ModelHandle> model_ptr)
-        self._model_pbuf_bytes = concat_model_bytes
+        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
+        self.treelite_handle = concat_model_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> concat_model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -540,53 +487,19 @@ class RandomForestClassifier(Base):
             memory used for the method.
 
         """
-        self._set_output_type(X)
-
-        # Reset the old tree data for new fit call
-        self._reset_forest_data()
-
         cdef uintptr_t X_ptr, y_ptr
-
-        X_m, n_rows, self.n_cols, self.dtype = \
-            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
-                                order='F')
+        X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype)
         X_ptr = X_m.ptr
-
-        y_m, _, _, y_dtype = \
-            input_to_cuml_array(y, check_dtype=np.int32,
-                                convert_to_dtype=(np.int32 if convert_dtype
-                                                  else None),
-                                check_rows=n_rows, check_cols=1)
         y_ptr = y_m.ptr
-        if y_dtype != np.int32:
-            raise TypeError("The labels `y` need to be of dtype `np.int32`")
-
-        if self.dtype == np.float64:
-            warnings.warn("To use GPU-based prediction, first train \
-                          using float 32 data to fit the estimator.")
-
         cdef cumlHandle* handle_ =\
             <cumlHandle*><size_t>self.handle.getHandle()
 
-        unique_labels = rmm_cupy_ary(cp.unique, y_m)
-        num_unique_labels = len(unique_labels)
-
-        for i in range(num_unique_labels):
-            if i not in unique_labels:
-                raise ValueError("The labels need "
-                                 "to be consecutive values from "
-                                 "0 to the number of unique label values")
-
-        max_feature_val = self._get_max_feat_val()
-        if type(self.min_rows_per_node) == float:
-            self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows)
-
         cdef RandomForestMetaData[float, int] *rf_forest = \
             new RandomForestMetaData[float, int]()
-        self.rf_forest = <size_t> rf_forest
+        self.rf_forest = <uintptr_t> rf_forest
         cdef RandomForestMetaData[double, int] *rf_forest64 = \
             new RandomForestMetaData[double, int]()
-        self.rf_forest64 = <size_t> rf_forest64
+        self.rf_forest64 = <uintptr_t> rf_forest64
 
         if self.seed is None:
             seed_val = <uintptr_t>NULL
@@ -608,14 +521,15 @@ class RandomForestClassifier(Base):
                                      <CRITERION> self.split_criterion,
                                      <bool> self.quantile_per_tree,
                                      <int> self.n_streams)
+
         if self.dtype == np.float32:
             fit(handle_[0],
                 rf_forest,
                 <float*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <int*> y_ptr,
-                <int> num_unique_labels,
+                <int> self.num_classes,
                 rf_params,
                 <int> self.verbosity)
 
@@ -624,10 +538,10 @@ class RandomForestClassifier(Base):
             fit(handle_[0],
                 rf_forest64,
                 <double*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <int*> y_ptr,
-                <int> num_unique_labels,
+                <int> self.num_classes,
                 rf_params64,
                 <int> self.verbosity)
 
@@ -638,59 +552,10 @@ class RandomForestClassifier(Base):
         # make sure that the `fit` is complete before the following delete
         # call happens
         self.handle.sync()
-        del(X_m)
-        del(y_m)
-        self.num_classes = num_unique_labels
+        del X_m
+        del y_m
         return self
 
-    def _predict_model_on_gpu(self, X, output_class,
-                              threshold, algo,
-                              num_classes, convert_dtype,
-                              fil_sparse_format, predict_proba):
-        out_type = self._get_output_type(X)
-        cdef ModelHandle cuml_model_ptr = NULL
-        _, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='F',
-                                check_cols=self.n_cols)
-
-        if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            Please set convert_dtype=True to convert the test \
-                            data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
-                            predict_model='CPU' to use the CPU implementation \
-                            of predict.")
-
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
-
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> n_cols,
-                              <int> num_classes,
-                              <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-
-        storage_type = \
-            _check_fil_parameter_validity(depth=self.max_depth,
-                                          fil_sparse_format=fil_sparse_format,
-                                          algo=algo)
-
-        fil_model = ForestInference()
-        tl_to_fil_model = \
-            fil_model.load_from_randomforest(treelite_handle,
-                                             output_class=output_class,
-                                             threshold=threshold,
-                                             algo=algo,
-                                             storage_type=storage_type)
-
-        preds = tl_to_fil_model.predict(X, output_type=out_type,
-                                        predict_proba=predict_proba)
-        tl.free_treelite_model(treelite_handle)
-        return preds
-
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr
@@ -705,13 +570,13 @@ class RandomForestClassifier(Base):
         cdef uintptr_t preds_ptr = preds.ptr
 
         cdef cumlHandle* handle_ =\
-            <cumlHandle*><size_t>self.handle.getHandle()
+            <cumlHandle*><uintptr_t>self.handle.getHandle()
 
         cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
 
         cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><size_t> self.rf_forest64
+            <RandomForestMetaData[double, int]*><uintptr_t> self.rf_forest64
         if self.dtype == np.float32:
             predict(handle_[0],
                     rf_forest,
@@ -817,10 +682,10 @@ class RandomForestClassifier(Base):
 
         else:
             preds = \
-                self._predict_model_on_gpu(X, output_class=output_class,
+                self._predict_model_on_gpu(model=RandomForestClassifier,
+                                           X=X, output_class=output_class,
                                            threshold=threshold,
                                            algo=algo,
-                                           num_classes=num_classes,
                                            convert_dtype=convert_dtype,
                                            fil_sparse_format=fil_sparse_format,
                                            predict_proba=False)
@@ -856,12 +721,12 @@ class RandomForestClassifier(Base):
         preds_ptr = preds.ptr
 
         cdef cumlHandle* handle_ =\
-            <cumlHandle*><size_t>self.handle.getHandle()
+            <cumlHandle*><uintptr_t>self.handle.getHandle()
         cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
 
         cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><size_t> self.rf_forest64
+            <RandomForestMetaData[double, int]*><uintptr_t> self.rf_forest64
         if self.dtype == np.float32:
             predictGetAll(handle_[0],
                           rf_forest,
@@ -1050,13 +915,13 @@ class RandomForestClassifier(Base):
         preds_ptr = preds_m.ptr
 
         cdef cumlHandle* handle_ =\
-            <cumlHandle*><size_t>self.handle.getHandle()
+            <cumlHandle*><uintptr_t>self.handle.getHandle()
 
         cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
 
         cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><size_t> self.rf_forest64
+            <RandomForestMetaData[double, int]*><uintptr_t> self.rf_forest64
 
         if self.dtype == np.float32:
             self.stats = score(handle_[0],
@@ -1091,13 +956,10 @@ class RandomForestClassifier(Base):
         -----------
         deep : boolean (default = True)
         """
-        params = dict()
-        for key in RandomForestClassifier.variables:
-            if key in ['handle']:
-                continue
-            var_value = getattr(self, key, None)
-            params[key] = var_value
-        return params
+
+
+        return self._get_params(model=RandomForestClassifier,
+                                deep=deep)
 
     def set_params(self, **params):
         """
@@ -1110,27 +972,20 @@ class RandomForestClassifier(Base):
         params : dict of new params
         """
         # Resetting handle as __setstate__ overwrites with handle=None
-        self.handle.__setstate__(self.n_streams)
-        self.model_pbuf_bytes = []
-
-        if not params:
-            return self
-        for key, value in params.items():
-            if key not in RandomForestClassifier.variables:
-                raise ValueError('Invalid parameter for estimator')
-            else:
-                setattr(self, key, value)
-        return self
+
+
+        return self._set_params(model=RandomForestClassifier,
+                                **params)
 
     def print_summary(self):
         """
         Prints the summary of the forest used to train and test the model
         """
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+        cdef RandomForestMetaData[float, float] *rf_forest = \
+            <RandomForestMetaData[float, float]*><size_t> self.rf_forest
 
-        cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><size_t> self.rf_forest64
+        cdef RandomForestMetaData[double, double] *rf_forest64 = \
+            <RandomForestMetaData[double, double]*><size_t> self.rf_forest64
 
         if self.dtype == np.float64:
             print_rf_summary(rf_forest64)
@@ -1142,11 +997,11 @@ class RandomForestClassifier(Base):
         Prints the detailed information about the forest used to
         train and test the Random Forest model
         """
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+        cdef RandomForestMetaData[float, float] *rf_forest = \
+            <RandomForestMetaData[float, float]*><size_t> self.rf_forest
 
-        cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><size_t> self.rf_forest64
+        cdef RandomForestMetaData[double, double] *rf_forest64 = \
+            <RandomForestMetaData[double, double]*><size_t> self.rf_forest64
 
         if self.dtype == np.float64:
             print_rf_detailed(rf_forest64)
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 0feb5dabf0..2d64f0718a 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -34,9 +34,10 @@ from cuml import ForestInference
 from cuml.common.array import CumlArray
 from cuml.common.base import Base
 from cuml.common.handle import Handle
+from cuml.ensemble.randomforest_common import BaseRandomForestModel
 from cuml.common.handle cimport cumlHandle
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
-    _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
+    _obtain_treelite_model, _obtain_fil_model
 
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.fil.fil import TreeliteModel as tl
@@ -51,7 +52,7 @@ cimport cuml.common.cuda
 
 
 cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
-
+    
     cdef void fit(cumlHandle & handle,
                   RandomForestMetaData[float, float]*,
                   float*,
@@ -69,7 +70,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   double*,
                   RF_params,
                   int) except +
-
+    
     cdef void predict(cumlHandle& handle,
                       RandomForestMetaData[float, float] *,
                       float*,
@@ -101,7 +102,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                           int) except +
 
 
-class RandomForestRegressor(Base):
+class RandomForestRegressor(BaseRandomForestModel):
 
     """
     Implements a Random Forest regressor model which fits multiple decision
@@ -207,87 +208,19 @@ class RandomForestRegressor(Base):
         currently fully guarantee the exact same results.
 
     """
-    variables = ['n_estimators', 'max_depth', 'handle',
-                 'max_features', 'n_bins',
-                 'split_algo', 'split_criterion', 'min_rows_per_node',
-                 'min_impurity_decrease',
-                 'bootstrap', 'bootstrap_features',
-                 'verbose', 'rows_sample',
-                 'max_leaves', 'quantile_per_tree',
-                 'accuracy_metric']
-
-    def __init__(self, n_estimators=100, max_depth=16, handle=None,
-                 max_features='auto', n_bins=8, n_streams=8,
-                 split_algo=1, split_criterion=2,
-                 bootstrap=True, bootstrap_features=False,
-                 verbose=False, min_rows_per_node=2,
-                 rows_sample=1.0, max_leaves=-1,
-                 accuracy_metric='mse', output_type=None,
-                 min_samples_leaf=None, dtype=None,
-                 min_weight_fraction_leaf=None, n_jobs=None,
-                 max_leaf_nodes=None, min_impurity_decrease=0.0,
-                 min_impurity_split=None, oob_score=None,
-                 random_state=None, warm_start=None, class_weight=None,
-                 quantile_per_tree=False, criterion=None, seed=None):
-        sklearn_params = {"criterion": criterion,
-                          "min_samples_leaf": min_samples_leaf,
-                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
-                          "max_leaf_nodes": max_leaf_nodes,
-                          "min_impurity_split": min_impurity_split,
-                          "oob_score": oob_score, "n_jobs": n_jobs,
-                          "random_state": random_state,
-                          "warm_start": warm_start,
-                          "class_weight": class_weight}
-
-        for key, vals in sklearn_params.items():
-            if vals is not None:
-                raise TypeError(" The Scikit-learn variable ", key,
-                                " is not supported in cuML,"
-                                " please read the cuML documentation for"
-                                " more information")
-
-        if handle is None:
-            handle = Handle(n_streams)
-
-        super(RandomForestRegressor, self).__init__(handle=handle,
-                                                    verbose=verbose,
-                                                    output_type=output_type)
-
-        if max_depth < 0:
-            raise ValueError("Must specify max_depth >0 ")
-
-        self.split_algo = split_algo
-        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                          '3': MAE, '4': CRITERION_END}
-        if str(split_criterion) not in criterion_dict.keys():
-            warnings.warn("The split criterion chosen was not present"
-                          " in the list of options accepted by the model"
-                          " and so the CRITERION_END option has been chosen.")
-            self.split_criterion = CRITERION_END
-        else:
-            self.split_criterion = criterion_dict[str(split_criterion)]
-
-        self.min_rows_per_node = min_rows_per_node
-        self.min_impurity_decrease = min_impurity_decrease
-        self.bootstrap_features = bootstrap_features
-        self.rows_sample = rows_sample
-        self.max_leaves = max_leaves
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.max_features = max_features
-        self.bootstrap = bootstrap
-        self.verbose = verbose
-        self.n_bins = n_bins
-        self.n_cols = None
-        self.dtype = None
-        self.accuracy_metric = accuracy_metric
-        self.quantile_per_tree = quantile_per_tree
-        self.n_streams = handle.getNumInternalStreams()
-        self.seed = seed
+    def __init__(self, split_criterion=2, seed=None,
+                 accuracy_metric='mse', n_streams=8,
+                 **kwargs):
+
         if ((seed is not None) and (n_streams != 1)):
             warnings.warn("Setting the random seed does not fully guarantee"
                           " the exact same results at this time.")
-        self.model_pbuf_bytes = []
+        self.RF_type = REGRESSION
+        self._create_model(model=RandomForestRegressor,
+                           split_criterion=split_criterion,
+                           seed=seed, n_streams=n_streams,
+                           accuracy_metric=accuracy_metric,
+                           **kwargs)
 
     """
     TODO:
@@ -358,34 +291,26 @@ class RandomForestRegressor(Base):
             free(<RandomForestMetaData[double, double]*><size_t>
                  self.rf_forest64)
 
-    def _get_max_feat_val(self):
-        if type(self.max_features) == int:
-            return self.max_features/self.n_cols
-        elif type(self.max_features) == float:
-            return self.max_features
-        elif self.max_features == 'sqrt':
-            return 1/np.sqrt(self.n_cols)
-        elif self.max_features == 'auto':
-            return 1.0
-        elif self.max_features == 'log2':
-            return math.log2(self.n_cols)/self.n_cols
-        else:
-            raise ValueError("Wrong value passed in for max_features"
-                             " please read the documentation")
-
     def _obtain_treelite_handle(self):
-        task_category = REGRESSION_MODEL
         cdef ModelHandle cuml_model_ptr = NULL
         cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><size_t> self.rf_forest
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return treelite_handle
+            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
+        cdef vector[unsigned char] model_pbuf_vec
+        with cython.boundscheck(False):
+            model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
+        if self.treelite_handle is None:
+            task_category = REGRESSION_MODEL
+            build_treelite_forest(
+                & cuml_model_ptr,
+                rf_forest,
+                <int> self.n_cols,
+                <int> task_category,
+                model_pbuf_vec)
+            mod_ptr = <uintptr_t> cuml_model_ptr
+            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return self.treelite_handle
 
     def _get_protobuf_bytes(self):
         fit_mod_ptr = self._obtain_treelite_handle()
@@ -404,60 +329,12 @@ class RandomForestRegressor(Base):
         treelite_handle = self._obtain_treelite_handle()
         return _obtain_treelite_model(treelite_handle)
 
-    def convert_to_fil_model(self, output_class=False,
-                             algo='auto',
-                             fil_sparse_format='auto'):
-        """
-        Create a Forest Inference (FIL) model from the trained cuML
-        Random Forest model.
-
-        Parameters
-        ----------
-        output_class : boolean (default = True)
-            This is optional and required only while performing the
-            predict operation on the GPU.
-            If true, return a 1 or 0 depending on whether the raw
-            prediction exceeds the threshold. If False, just return
-            the raw prediction.
-        algo : string (default = 'auto')
-            This is optional and required only while performing the
-            predict operation on the GPU.
-            'naive' - simple inference using shared memory
-            'tree_reorg' - similar to naive but trees rearranged to be more
-            coalescing-friendly
-            'batch_tree_reorg' - similar to tree_reorg but predicting
-            multiple rows per thread block
-            `auto` - choose the algorithm automatically. Currently
-            'batch_tree_reorg' is used for dense storage
-            and 'naive' for sparse storage
-        fil_sparse_format : boolean or string (default = auto)
-            This variable is used to choose the type of forest that will be
-            created in the Forest Inference Library. It is not required
-            while using predict_model='CPU'.
-            'auto' - choose the storage type automatically
-            (currently True is chosen by auto)
-            False - create a dense forest
-            True - create a sparse forest, requires algo='naive'
-            or algo='auto'
-
-        Returns
-        ----------
-        fil_model :
-            A Forest Inference model which can be used to perform
-            inferencing on the random forest model.
-
-        """
-        treelite_handle = self._obtain_treelite_handle()
-        return _obtain_fil_model(treelite_handle=treelite_handle,
-                                 depth=self.max_depth,
-                                 output_class=output_class,
-                                 algo=algo,
-                                 fil_sparse_format=fil_sparse_format)
 
     """
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
     """
+    """
     def _tl_model_handles(self, model_bytes):
         task_category = REGRESSION_MODEL
         cdef ModelHandle tl_model_ptr = NULL
@@ -468,7 +345,7 @@ class RandomForestRegressor(Base):
                               <int> self.n_cols,
                               <int> task_category,
                               <vector[unsigned char] &> model_bytes)
-        mod_handle = <size_t> tl_model_ptr
+        mod_handle = <uintptr_t> tl_model_ptr
 
         return ctypes.c_void_p(mod_handle).value
 
@@ -491,6 +368,7 @@ class RandomForestRegressor(Base):
         cdef uintptr_t model_ptr = <uintptr_t> concat_model_handle
         concat_model_bytes = save_model(<ModelHandle> model_ptr)
         self.model_pbuf_bytes = concat_model_bytes
+    """
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -508,35 +386,13 @@ class RandomForestRegressor(Base):
             ndarray, cuda array interface compliant array like CuPy
             These labels should be contiguous integers from 0 to n_classes.
         """
-        self._set_output_type(X)
-
-        # Reset the old tree data for new fit call
-        self._reset_forest_data()
-
         cdef uintptr_t X_ptr, y_ptr
-
-        X_m, n_rows, self.n_cols, self.dtype = \
-            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
-                                order='F')
+        X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype)
         X_ptr = X_m.ptr
-        y_m, _, _, y_dtype = \
-            input_to_cuml_array(y,
-                                convert_to_dtype=(self.dtype if convert_dtype
-                                                  else None),
-                                check_rows=n_rows, check_cols=1)
         y_ptr = y_m.ptr
-
-        if self.dtype == np.float64:
-            warnings.warn("To use GPU-based prediction, first train using \
-                          float 32 data to fit the estimator.")
-
         cdef cumlHandle* handle_ =\
             <cumlHandle*><size_t>self.handle.getHandle()
 
-        max_feature_val = self._get_max_feat_val()
-        if type(self.min_rows_per_node) == float:
-            self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows)
-
         cdef RandomForestMetaData[float, float] *rf_forest = \
             new RandomForestMetaData[float, float]()
         self.rf_forest = <size_t> rf_forest
@@ -569,7 +425,7 @@ class RandomForestRegressor(Base):
             fit(handle_[0],
                 rf_forest,
                 <float*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <float*> y_ptr,
                 rf_params,
@@ -580,7 +436,7 @@ class RandomForestRegressor(Base):
             fit(handle_[0],
                 rf_forest64,
                 <double*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <double*> y_ptr,
                 rf_params64,
@@ -588,55 +444,10 @@ class RandomForestRegressor(Base):
         # make sure that the `fit` is complete before the following delete
         # call happens
         self.handle.sync()
-        del(X_m)
-        del(y_m)
+        del X_m
+        del y_m
         return self
 
-    def _predict_model_on_gpu(self, X, algo, convert_dtype,
-                              fil_sparse_format):
-        out_type = self._get_output_type(X)
-        cdef ModelHandle cuml_model_ptr = NULL
-        _, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='F',
-                                check_cols=self.n_cols)
-
-        if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            Please set convert_dtype=True to convert the test \
-                            data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
-                            predict_model='CPU' to use the CPU implementation \
-                            of predict.")
-
-        cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><size_t> self.rf_forest
-
-        task_category = REGRESSION_MODEL
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-
-        storage_type = \
-            _check_fil_parameter_validity(depth=self.max_depth,
-                                          fil_sparse_format=fil_sparse_format,
-                                          algo=algo)
-
-        fil_model = ForestInference()
-        tl_to_fil_model = \
-            fil_model.load_from_randomforest(treelite_handle,
-                                             output_class=False,
-                                             algo=algo,
-                                             storage_type=storage_type)
-
-        preds = tl_to_fil_model.predict(X, out_type)
-        tl.free_treelite_model(treelite_handle)
-        return preds
-
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr
@@ -744,8 +555,11 @@ class RandomForestRegressor(Base):
                             setting predict_model = 'CPU'")
 
         else:
-            preds = self._predict_model_on_gpu(X, algo, convert_dtype,
-                                               fil_sparse_format)
+            preds = self._predict_model_on_gpu(model=RandomForestRegressor,
+                                               X=X,
+                                               algo=algo,
+                                               convert_dtype=convert_dtype,
+                                               fil_sparse_format=fil_sparse_format)
 
         return preds
 
@@ -861,14 +675,8 @@ class RandomForestRegressor(Base):
         -----------
         deep : boolean (default = True)
         """
-
-        params = dict()
-        for key in RandomForestRegressor.variables:
-            if key in ['handle']:
-                continue
-            var_value = getattr(self, key, None)
-            params[key] = var_value
-        return params
+        return self._get_params(model=RandomForestRegressor,
+                                deep=deep)
 
     def set_params(self, **params):
         """
@@ -881,18 +689,8 @@ class RandomForestRegressor(Base):
         params : dict of new params
         """
         # Resetting handle as __setstate__ overwrites with handle=None
-        self.handle.__setstate__(self.n_streams)
-        self.model_pbuf_bytes = []
-
-        if not params:
-            return self
-        for key, value in params.items():
-            if key not in RandomForestRegressor.variables:
-                raise ValueError('Invalid parameter for estimator')
-            else:
-                setattr(self, key, value)
-
-        return self
+        return self._set_params(model=RandomForestRegressor,
+                                **params)
 
     def print_summary(self):
         """

From 9a9800adf5d12157b3b1cce0f6fddd1532b8439a Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Tue, 12 May 2020 20:15:46 -0500
Subject: [PATCH 02/32] update cython code

---
 python/cuml/ensemble/randomforest_common.pyx  | 135 +++++++-----------
 python/cuml/ensemble/randomforest_shared.pxd  |  37 ++++-
 .../cuml/ensemble/randomforestclassifier.pyx  |  31 +---
 .../cuml/ensemble/randomforestregressor.pyx   |  30 +---
 4 files changed, 90 insertions(+), 143 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 4ff32cae5a..54f9f48414 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -16,16 +16,25 @@
 
 import ctypes
 import cupy as cp
+import math
+import warnings
+
+import numpy as np
 from cuml import ForestInference
 from cuml.fil.fil import TreeliteModel as tl
 from cuml.common.handle import Handle
 from cuml.common.base import Base
 
 from cuml.ensemble.randomforest_shared cimport *
-from cuml.utils import input_to_cuml_array, rmm_cupy_ary
+from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 cimport cython
 
+#RandomForestMetaData[X_dtype, y_dtype] *meta
+#cdef creat_meta(X_dtype a, y_dtype b):
+#    meta = new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]()
+    #return meta
+
 
 class BaseRandomForestModel(Base):
     variables = ['n_estimators', 'max_depth', 'handle',
@@ -108,6 +117,7 @@ class BaseRandomForestModel(Base):
         self.n_streams = handle.getNumInternalStreams()
         self.seed = seed
         self.model_pbuf_bytes = bytearray()
+        self.treelite_handle = None
        # if self.model_type == curfr:
        # print have a check for the random forest meta data in init
     """
@@ -132,39 +142,53 @@ class BaseRandomForestModel(Base):
             raise ValueError("Wrong value passed in for max_features"
                              " please read the documentation")
 
-    def check_rf_metadata_type(self):
-        cdef RandomForestMetaData[float, int] *rf_forest_class
-        cdef RandomForestMetaData[double, int] *rf_forest64_class
-        cdef RandomForestMetaData[float, float] *rf_forest_reg
-        cdef RandomForestMetaData[double, double] *rf_forest64_reg
+    def _obtain_treelite_handle(self):
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
+        cdef vector[unsigned char] model_pbuf_vec
+        with cython.boundscheck(False):
+            model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
+
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        print(self.RF_type)
+        cdef cython.float a
+        cdef cython.type b
         if self.RF_type == CLASSIFICATION:
-            rf_forest_class = \
-                new RandomForestMetaData[float, int]()
-            self.rf_forest = <uintptr_t> rf_forest_class
-            rf_forest64_class = \
-                new RandomForestMetaData[double, int]()
-            self.rf_forest64 = <uintptr_t> rf_forest64_class
+            meta = create_meta(a, b)
         else:
-            rf_forest_reg = \
-                new RandomForestMetaData[float, float]()
-            self.rf_forest = <uintptr_t> rf_forest_reg
-            rf_forest64_reg = \
-                new RandomForestMetaData[double, double]()
-            self.rf_forest64 = <uintptr_t> rf_forest64_reg
-
-    def fit_setup(self, X, y, convert_dtype):
+            meta = create_meta(a, a)
+            #<RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+        #cdef object (*meta_info)(float, int)
+        #meta_info = create_meta
+        
+        #cdef RandomForestMetaData[cython.typeof(a), cython.typeof(b)] *meta_info = get_meta_data[cython.typeof(a), cython.typeof(b)](<void*> self.rf_forest)
+            #<RandomForestMetaData[cython.typeof(a), cython.typeof(b)]><uintptr_t> self.forest #get_meta_data[float, int](<uintptr_t> self.rf_forest)
+        #cdef RandomForestMetaData[float, int] *rf_forest
+        #cdef fused_rf_meta *forest = \
+        #    <fused_rf_meta[cython.float, cython.int]*><uintptr_t>self.rf_forest
+        if self.treelite_handle is None:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                meta,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+            mod_ptr = <uintptr_t> cuml_model_ptr
+            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+          
+        return self.treelite_handle
+
+    def _dataset_setup(self, X, y, convert_dtype):
         self._set_output_type(X)
 
         # Reset the old tree data for new fit call
         self._reset_forest_data()
 
-        #cdef uintptr_t X_ptr, y_ptr
-
         X_m, self.n_rows, self.n_cols, self.dtype = \
             input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
                                 order='F')
-        X_ptr = X_m.ptr
-        print(" type pf X_ptr in common : ", type(X_ptr))
         if self.RF_type == CLASSIFICATION:
             y_m, _, _, y_dtype = \
                 input_to_cuml_array(y, check_dtype=np.int32,
@@ -186,7 +210,6 @@ class BaseRandomForestModel(Base):
                                     convert_to_dtype=(self.dtype if convert_dtype
                                                       else None),
                                     check_rows=self.n_rows, check_cols=1)
-        y_ptr = y_m.ptr
 
         if self.dtype == np.float64:
             warnings.warn("To use GPU-based prediction, first train using \
@@ -196,44 +219,6 @@ class BaseRandomForestModel(Base):
         if type(self.min_rows_per_node) == float:
             self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows)
 
-        """
-        cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest
-        cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest64
-        if self.RF_type == CLASSIFICATION:
-            *rf_forest = \
-                new RandomForestMetaData[float, int]()
-            self.rf_forest = <uintptr_t> rf_forest
-            *rf_forest64 = \
-                new RandomForestMetaData[double, int]()
-            self.rf_forest64 = <uintptr_t> rf_forest64
-        else:
-            *rf_forest = \
-                new RandomForestMetaData[float, float]()
-            self.rf_forest = <uintptr_t> rf_forest
-            *rf_forest64 = \
-                new RandomForestMetaData[double, double]()
-            self.rf_forest64 = <uintptr_t> rf_forest64
-
-        if self.dtype == np.float32:
-            fit(handle_[0],
-                rf_forest,
-                <float*> X_ptr,
-                <int> self.n_rows,
-                <int> self.n_cols,
-                <float*> y_ptr,
-                rf_params,
-                <int> self.verbosity)
-        else:
-            rf_params64 = rf_params
-            fit(handle_[0],
-                rf_forest64,
-                <double*> X_ptr,
-                <int> self.n_rows,
-                <int> self.n_cols,
-                <double*> y_ptr,
-                rf_params64,
-                <int> self.verbosity)
-        """
         return X_m, y_m, max_feature_val
 
     def _predict_model_on_gpu(self, model, X, algo, convert_dtype,
@@ -254,7 +239,7 @@ class BaseRandomForestModel(Base):
                             predict_model='CPU' to use the CPU implementation \
                             of predict.")
 
-        model._obtain_treelite_handle()
+        self._obtain_treelite_handle()
         storage_type = \
             _check_fil_parameter_validity(depth=self.max_depth,
                                           fil_sparse_format=fil_sparse_format,
@@ -295,28 +280,6 @@ class BaseRandomForestModel(Base):
                 setattr(self, key, value)
         return self
 
-    """
-    def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef rf_class_float *rf_forest_class
-        cdef rf_reg_float *rf_forest_reg
-        if task_category == CLASSIFICATION:
-            rf_forest_class = \
-                <rf_meta_type*><uintptr_t> self.rf_forest
-
-        else:
-            rf_forest_reg = \
-                <rf_meta_type*><uintptr_t> self.rf_forest
-        build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr,
-                                  rf_forest_reg,
-                                  <int> self.n_cols,
-                                  <int> task_category,
-                                  <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return treelite_handle
-
-    """
     def _get_protobuf_bytes_common(self, model):
         fit_mod_ptr = model._obtain_treelite_handle()
         cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 976c85e7d0..e7074e91e9 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -33,7 +33,7 @@ from cuml.common.handle import Handle
 from cuml import ForestInference
 from cuml.common.base import Base
 from cuml.common.handle cimport cumlHandle
-from cuml.common import get_cudf_column_ptr, get_dev_array_ptr, \
+from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \
     input_to_dev_array, zeros
 cimport cuml.common.handle
 cimport cuml.common.cuda
@@ -47,7 +47,7 @@ cdef extern from "treelite/c_api.h":
                                          ModelHandle model)
     cdef const char* TreeliteGetLastError()
 
-cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
+cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
     cdef enum CRITERION:
         GINI,
         ENTROPY,
@@ -55,7 +55,15 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
         MAE,
         CRITERION_END
 
-cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree":
+cdef extern from "cuml/tree/flatnode.h" namespace "ML::Flatnode" nogil:
+    cdef cppclass SparseTreeNode[T, L]:
+        L prediction
+        int colid
+        T quesval
+        T best_metric_val
+        int left_child_id
+
+cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil:
     cdef struct DecisionTreeParams:
         int max_depth
         int max_leaves
@@ -67,7 +75,15 @@ cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree":
         bool quantile_per_tree
         CRITERION split_criterion
 
-cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
+    cdef cppclass TreeMetaDataNode[T, L]:
+        int treeid
+        int depth_counter
+        int leaf_counter
+        double prepare_time
+        double train_time
+        vector[SparseTreeNode[T, L]] sparsetree
+
+cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
 
     cdef enum RF_type:
         CLASSIFICATION,
@@ -92,18 +108,29 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
         pass
 
     cdef cppclass RandomForestMetaData[T, L]:
-        void* trees
+        ctypedef TreeMetaDataNode[T, L]* trees
         RF_params rf_params
 
+
+    ctypedef fused fused_rf_meta:
+       RandomForestMetaData[float, float]
+       RandomForestMetaData[double, double]
+       RandomForestMetaData[float, int]
+       RandomForestMetaData[double, int]
+
+    cdef fused_rf_meta *meta
+
     #
     # Treelite handling
     #
+
     cdef void build_treelite_forest[T, L](ModelHandle*,
                                           RandomForestMetaData[T, L]*,
                                           int,
                                           int,
                                           vector[unsigned char] &) except +
 
+
     cdef vector[unsigned char] save_model_protobuf(ModelHandle) except +
 
     cdef void print_rf_summary[T, L](RandomForestMetaData[T, L]*) except +
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index b5b56f322f..3dbb0cedf5 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -309,31 +309,6 @@ class RandomForestClassifier(BaseRandomForestModel):
             free(<RandomForestMetaData[double, int]*><uintptr_t>
                  self.rf_forest64)
 
-    def _obtain_treelite_handle(self):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-        if self.num_classes > 2:
-            raise NotImplementedError("Pickling for multi-class "
-                                      "classification models is currently not "
-                                      "implemented. Please check cuml issue "
-                                      "#1679 for more information.")
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-        if self.treelite_handle is None:
-            build_treelite_forest(
-                & cuml_model_ptr,
-                rf_forest,
-                <int> self.n_cols,
-                <int> self.num_classes,
-                model_pbuf_vec)
-            mod_ptr = <uintptr_t> cuml_model_ptr
-            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return self.treelite_handle
-
     def _get_protobuf_bytes(self):
         """
         Returns the self.model_pbuf_bytes.
@@ -488,7 +463,7 @@ class RandomForestClassifier(BaseRandomForestModel):
 
         """
         cdef uintptr_t X_ptr, y_ptr
-        X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype)
+        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
         X_ptr = X_m.ptr
         y_ptr = y_m.ptr
         cdef cumlHandle* handle_ =\
@@ -828,10 +803,10 @@ class RandomForestClassifier(BaseRandomForestModel):
                                       "implemented. Please check cuml issue "
                                       "#1679 for more information.")
         preds_proba = \
-            self._predict_model_on_gpu(X, output_class=output_class,
+            self._predict_model_on_gpu(model=RandomForestClassifier,
+                                       X=X, output_class=output_class,
                                        threshold=threshold,
                                        algo=algo,
-                                       num_classes=num_classes,
                                        convert_dtype=convert_dtype,
                                        fil_sparse_format=fil_sparse_format,
                                        predict_proba=True)
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index feae6d5b87..6fa4090e6a 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -21,7 +21,6 @@
 
 import ctypes
 import cudf
-import math
 import numpy as np
 import warnings
 
@@ -41,8 +40,7 @@ from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
 
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.fil.fil import TreeliteModel as tl
-from cuml.common import input_to_cuml_array, input_to_dev_array, \
-    zeros, get_cudf_column_ptr
+from cuml.common import input_to_cuml_array
 from cython.operator cimport dereference as deref
 
 from numba import cuda
@@ -291,27 +289,11 @@ class RandomForestRegressor(BaseRandomForestModel):
                  self.rf_forest)
             free(<RandomForestMetaData[double, double]*><uintptr_t>
                  self.rf_forest64)
-
+    """
     def _obtain_treelite_handle(self):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-        if self.treelite_handle is None:
-            task_category = REGRESSION_MODEL
-            build_treelite_forest(
-                & cuml_model_ptr,
-                rf_forest,
-                <int> self.n_cols,
-                <int> task_category,
-                model_pbuf_vec)
-            mod_ptr = <uintptr_t> cuml_model_ptr
-            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return self.treelite_handle
+        cdef RandomForestMetaData[float, float] rf_forest 
+        self._get_treelite_handle(rf_forest)
+    """
 
     def _get_protobuf_bytes(self):
         """
@@ -411,7 +393,7 @@ class RandomForestRegressor(BaseRandomForestModel):
             These labels should be contiguous integers from 0 to n_classes.
         """
         cdef uintptr_t X_ptr, y_ptr
-        X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype)
+        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
         X_ptr = X_m.ptr
         y_ptr = y_m.ptr
         cdef cumlHandle* handle_ =\

From f229b79a4da7b146c4e7a021cf7635be6cff9d36 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Tue, 12 May 2020 20:25:46 -0500
Subject: [PATCH 03/32] update common file

---
 python/cuml/ensemble/randomforest_common.pyx | 31 ++++++++++++++++----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 54f9f48414..7d35af96a3 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -153,13 +153,10 @@ class BaseRandomForestModel(Base):
         mod_ptr = <uintptr_t> cuml_model_ptr
         self.treelite_handle = ctypes.c_void_p(mod_ptr).value
         print(self.RF_type)
-        cdef cython.float a
-        cdef cython.type b
         if self.RF_type == CLASSIFICATION:
-            meta = create_meta(a, b)
+            meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
         else:
-            meta = create_meta(a, a)
-            #<RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+            meta = <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
         #cdef object (*meta_info)(float, int)
         #meta_info = create_meta
         
@@ -186,6 +183,8 @@ class BaseRandomForestModel(Base):
         # Reset the old tree data for new fit call
         self._reset_forest_data()
 
+        #cdef uintptr_t X_ptr, y_ptr
+
         X_m, self.n_rows, self.n_cols, self.dtype = \
             input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
                                 order='F')
@@ -280,6 +279,28 @@ class BaseRandomForestModel(Base):
                 setattr(self, key, value)
         return self
 
+    """
+    def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type):
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef rf_class_float *rf_forest_class
+        cdef rf_reg_float *rf_forest_reg
+        if task_category == CLASSIFICATION:
+            rf_forest_class = \
+                <rf_meta_type*><uintptr_t> self.rf_forest
+
+        else:
+            rf_forest_reg = \
+                <rf_meta_type*><uintptr_t> self.rf_forest
+        build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr,
+                                  rf_forest_reg,
+                                  <int> self.n_cols,
+                                  <int> task_category,
+                                  <vector[unsigned char] &> self.model_pbuf_bytes)
+        mod_ptr = <size_t> cuml_model_ptr
+        treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return treelite_handle
+
+    """
     def _get_protobuf_bytes_common(self, model):
         fit_mod_ptr = model._obtain_treelite_handle()
         cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr

From a77a468b4c815ccd4140423730a47a300367e293 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 13 May 2020 07:28:56 -0500
Subject: [PATCH 04/32] update predict func

---
 cpp/include/cuml/ensemble/randomforest.hpp    |  16 +--
 cpp/src/randomforest/randomforest.cu          |  25 ++--
 python/cuml/ensemble/randomforest_common.pyx  | 108 +++++++++++++++++-
 python/cuml/ensemble/randomforest_shared.pxd  |   7 ++
 .../cuml/ensemble/randomforestclassifier.pyx  |  78 ++-----------
 .../cuml/ensemble/randomforestregressor.pyx   |  77 ++-----------
 6 files changed, 154 insertions(+), 157 deletions(-)

diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index 8f8b04a5d2..f3ddcec3ad 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -144,14 +144,17 @@ void fit(const cumlHandle& user_handle, RandomForestClassifierD*& forest,
          int n_unique_labels, RF_params rf_params,
          int verbosity = CUML_LEVEL_INFO);
 
+template <class T, class L>
 void predict(const cumlHandle& user_handle,
-             const RandomForestClassifierF* forest, const float* input,
-             int n_rows, int n_cols, int* predictions,
+             const RandomForestMetaData<T, L>* forest, const T* input,
+             int n_rows, int n_cols, L* predictions,
              int verbosity = CUML_LEVEL_INFO);
+/**
 void predict(const cumlHandle& user_handle,
              const RandomForestClassifierD* forest, const double* input,
              int n_rows, int n_cols, int* predictions,
              int verbosity = CUML_LEVEL_INFO);
+*/
 
 void predictGetAll(const cumlHandle& user_handle,
                    const RandomForestClassifierF* forest, const float* input,
@@ -190,13 +193,10 @@ void fit(const cumlHandle& user_handle, RandomForestRegressorD*& forest,
          double* input, int n_rows, int n_cols, double* labels,
          RF_params rf_params, int verbosity = CUML_LEVEL_INFO);
 
+template <class T>
 void predict(const cumlHandle& user_handle,
-             const RandomForestRegressorF* forest, const float* input,
-             int n_rows, int n_cols, float* predictions,
-             int verbosity = CUML_LEVEL_INFO);
-void predict(const cumlHandle& user_handle,
-             const RandomForestRegressorD* forest, const double* input,
-             int n_rows, int n_cols, double* predictions,
+             const RandomForestMetaData<T, T>* forest, const T* input,
+             int n_rows, int n_cols, T* predictions,
              int verbosity = CUML_LEVEL_INFO);
 
 RF_metrics score(const cumlHandle& user_handle,
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 4287e40a96..ee3ffea291 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -526,16 +526,18 @@ void fit(const cumlHandle& user_handle, RandomForestClassifierD*& forest,
  * @param[in] verbosity: verbosity level for logging messages during execution
  * @{
  */
+template <class T, class L>
 void predict(const cumlHandle& user_handle,
-             const RandomForestClassifierF* forest, const float* input,
-             int n_rows, int n_cols, int* predictions, int verbosity) {
+             const RandomForestMetaData<T, L>* forest, const T* input,
+             int n_rows, int n_cols, L* predictions, int verbosity) {
   ASSERT(forest->trees, "Cannot predict! No trees in the forest.");
-  std::shared_ptr<rfClassifier<float>> rf_classifier =
-    std::make_shared<rfClassifier<float>>(forest->rf_params);
+  std::shared_ptr<rfClassifier<T>> rf_classifier =
+    std::make_shared<rfClassifier<T>>(forest->rf_params);
   rf_classifier->predict(user_handle, input, n_rows, n_cols, predictions,
                          forest, verbosity);
 }
 
+/**
 void predict(const cumlHandle& user_handle,
              const RandomForestClassifierD* forest, const double* input,
              int n_rows, int n_cols, int* predictions, int verbosity) {
@@ -545,6 +547,7 @@ void predict(const cumlHandle& user_handle,
   rf_classifier->predict(user_handle, input, n_rows, n_cols, predictions,
                          forest, verbosity);
 }
+*/
 /** @} */
 
 /**
@@ -689,16 +692,18 @@ void fit(const cumlHandle& user_handle, RandomForestRegressorD*& forest,
  * @param[in] verbosity: verbosity level for logging messages during execution
  * @{
  */
+template <class T>
 void predict(const cumlHandle& user_handle,
-             const RandomForestRegressorF* forest, const float* input,
-             int n_rows, int n_cols, float* predictions, int verbosity) {
+             const RandomForestMetaData<T, T>* forest, const T* input,
+             int n_rows, int n_cols, T* predictions, int verbosity) {
   ASSERT(forest->trees, "Cannot predict! No trees in the forest.");
-  std::shared_ptr<rfRegressor<float>> rf_regressor =
-    std::make_shared<rfRegressor<float>>(forest->rf_params);
+  std::shared_ptr<rfRegressor<T>> rf_regressor =
+    std::make_shared<rfRegressor<T>>(forest->rf_params);
   rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, forest,
                         verbosity);
 }
 
+/**
 void predict(const cumlHandle& user_handle,
              const RandomForestRegressorD* forest, const double* input,
              int n_rows, int n_cols, double* predictions, int verbosity) {
@@ -708,6 +713,9 @@ void predict(const cumlHandle& user_handle,
   rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, forest,
                         verbosity);
 }
+
+*/
+
 /** @} */
 
 /**
@@ -742,6 +750,7 @@ RF_metrics score(const cumlHandle& user_handle,
     user_handle, ref_labels, n_rows, predictions, verbosity);
   return regression_score;
 }
+
 /** @} */
 
 // Functions' specializations
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 7d35af96a3..a8ecdef6fb 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -24,6 +24,9 @@ from cuml import ForestInference
 from cuml.fil.fil import TreeliteModel as tl
 from cuml.common.handle import Handle
 from cuml.common.base import Base
+from cuml.common.array import CumlArray
+
+from cython.operator cimport dereference as deref
 
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.common import input_to_cuml_array, rmm_cupy_ary
@@ -152,11 +155,8 @@ class BaseRandomForestModel(Base):
 
         mod_ptr = <uintptr_t> cuml_model_ptr
         self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        print(self.RF_type)
-        if self.RF_type == CLASSIFICATION:
-            meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-        else:
-            meta = <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+        meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
+
         #cdef object (*meta_info)(float, int)
         #meta_info = create_meta
         
@@ -220,6 +220,56 @@ class BaseRandomForestModel(Base):
 
         return X_m, y_m, max_feature_val
 
+    def _get_protobuf_bytes(self):
+        """
+        Returns the self.model_pbuf_bytes.
+        Cuml RF model gets converted to treelite protobuf bytes by:
+            1. converting the cuml RF model to a treelite model. The treelite
+            models handle (pointer) is returned
+            2. The treelite model handle is used to convert the treelite model
+            to a treelite protobuf model which is stored in a temporary file.
+            The protobuf model information is read from the temporary file and
+            the byte information is returned.
+        The treelite handle is stored `self.treelite_handle` and the treelite
+        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
+        of information is already present in the model then the respective
+        step is skipped.
+        """
+        if self.model_pbuf_bytes:
+            return self.model_pbuf_bytes
+        elif self.treelite_handle:
+            fit_mod_ptr = self.treelite_handle
+        else:
+            fit_mod_ptr = self._obtain_treelite_handle()
+        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self.model_pbuf_bytes
+
+    def _concatenate_treelite_handle(self, treelite_handle):
+        cdef ModelHandle concat_model_handle = NULL
+        cdef vector[ModelHandle] *model_handles \
+            = new vector[ModelHandle]()
+        cdef uintptr_t mod_ptr
+        for i in treelite_handle:
+            mod_ptr = <uintptr_t>i
+            model_handles.push_back((
+                <ModelHandle> mod_ptr))
+
+        concat_model_handle = concatenate_trees(deref(model_handles))
+
+        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
+        self.treelite_handle = concat_model_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> concat_model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self
+
     def _predict_model_on_gpu(self, model, X, algo, convert_dtype,
                               fil_sparse_format, threshold=0.5,
                               output_class=False, predict_proba=False):
@@ -257,6 +307,54 @@ class BaseRandomForestModel(Base):
         tl.free_treelite_model(self.treelite_handle)
         return preds
 
+    def _predict_model_on_cpu(self, X, convert_dtype):
+        out_type = self._get_output_type(X)
+        cdef uintptr_t X_ptr
+        X_m, n_rows, n_cols, dtype = \
+            input_to_cuml_array(X, order='C',
+                                convert_to_dtype=(self.dtype if convert_dtype
+                                                  else None),
+                                check_cols=self.n_cols)
+        X_ptr = X_m.ptr
+
+        preds = CumlArray.zeros(n_rows, dtype=dtype)
+        cdef uintptr_t preds_ptr = preds.ptr
+
+        cdef cumlHandle* handle_ =\
+            <cumlHandle*><uintptr_t>self.handle.getHandle()
+
+        cdef RandomForestMetaData[float, int] *rf_forest = \
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
+
+        cdef RandomForestMetaData[double, int] *rf_forest64 = \
+            <RandomForestMetaData[double, int]*><uintptr_t> self.rf_forest64
+        if self.dtype == np.float32:
+            predict(handle_[0],
+                    rf_forest,
+                    <float*> X_ptr,
+                    <int> n_rows,
+                    <int> n_cols,
+                    <float*> preds_ptr,
+                    <int> self.verbosity)
+
+        elif self.dtype == np.float64:
+            predict(handle_[0],
+                    rf_forest64,
+                    <double*> X_ptr,
+                    <int> n_rows,
+                    <int> n_cols,
+                    <double*> preds_ptr,
+                    <int> self.verbosity)
+        else:
+            raise TypeError("supports only float32 and float64 input,"
+                            " but input of type '%s' passed."
+                            % (str(self.dtype)))
+
+        self.handle.sync()
+        # synchronous w/o a stream
+        del(X_m)
+        return preds.to_output(out_type)
+
     def _get_params(self, model, deep):
         params = dict()
         for key in model.variables:
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index e7074e91e9..d9270a3ead 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -157,3 +157,10 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
     cdef ModelHandle concatenate_trees(
         vector[ModelHandle] &treelite_handles) except +
 
+    cdef void predict[T, L](cumlHandle& handle,
+                      RandomForestMetaData[T, L] *,
+                      T*,
+                      int,
+                      int,
+                      L*,
+                      int) except +
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 4f5c9f0e56..5a98251932 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -33,8 +33,6 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uintptr_t
 from libc.stdlib cimport calloc, malloc, free
 
-from cython.operator cimport dereference as deref
-
 from cuml import ForestInference
 from cuml.common.array import CumlArray
 from cuml.common.base import Base
@@ -77,21 +75,13 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   RF_params,
                   int) except +
 
-    cdef void predict(cumlHandle& handle,
-                      RandomForestMetaData[float, int] *,
-                      float*,
-                      int,
-                      int,
-                      int*,
-                      bool) except +
-
-    cdef void predict(cumlHandle& handle,
-                      RandomForestMetaData[double, int]*,
-                      double*,
-                      int,
-                      int,
-                      int*,
-                      bool) except +
+    cdef void predict[T, L](cumlHandle& handle,
+                            RandomForestMetaData[T, L] *,
+                            T*,
+                            int,
+                            int,
+                            L*,
+                            int) except +
 
     cdef void predictGetAll(cumlHandle& handle,
                             RandomForestMetaData[float, int] *,
@@ -308,37 +298,6 @@ class RandomForestClassifier(BaseRandomForestModel):
                  self.rf_forest)
             free(<RandomForestMetaData[double, int]*><uintptr_t>
                  self.rf_forest64)
-            self.treelite_handle = None
-            self.model_pbuf_bytes = bytearray()
-
-    def _get_protobuf_bytes(self):
-        """
-        Returns the self.model_pbuf_bytes.
-        Cuml RF model gets converted to treelite protobuf bytes by:
-            1. converting the cuml RF model to a treelite model. The treelite
-            models handle (pointer) is returned
-            2. The treelite model handle is used to convert the treelite model
-            to a treelite protobuf model which is stored in a temporary file.
-            The protobuf model information is read from the temporary file and
-            the byte information is returned.
-        The treelite handle is stored `self.treelite_handle` and the treelite
-        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
-        of information is already present in the model then the respective
-        step is skipped.
-        """
-        if self.model_pbuf_bytes:
-            return self.model_pbuf_bytes
-        elif self.treelite_handle:
-            fit_mod_ptr = self.treelite_handle
-        else:
-            fit_mod_ptr = self._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self.model_pbuf_bytes
 
     def convert_to_treelite_model(self):
         """
@@ -423,26 +382,6 @@ class RandomForestClassifier(BaseRandomForestModel):
 
         return ctypes.c_void_p(mod_handle).value
 
-    def _concatenate_treelite_handle(self, treelite_handle):
-        cdef ModelHandle concat_model_handle = NULL
-        cdef vector[ModelHandle] *model_handles \
-            = new vector[ModelHandle]()
-        cdef uintptr_t mod_ptr
-        for i in treelite_handle:
-            mod_ptr = <uintptr_t>i
-            model_handles.push_back((
-                <ModelHandle> mod_ptr))
-
-        concat_model_handle = concatenate_trees(deref(model_handles))
-        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
-        self.treelite_handle = concat_model_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> concat_model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self
-
     def fit(self, X, y, convert_dtype=False):
         """
         Perform Random Forest Classification on the input data
@@ -532,7 +471,7 @@ class RandomForestClassifier(BaseRandomForestModel):
         del X_m
         del y_m
         return self
-
+    """
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr
@@ -580,6 +519,7 @@ class RandomForestClassifier(BaseRandomForestModel):
         # synchronous w/o a stream
         del(X_m)
         return preds.to_output(out_type)
+        """
 
     def predict(self, X, predict_model="GPU",
                 output_class=True, threshold=0.5,
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 46d43c20a4..b356e8d901 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -68,22 +68,14 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   double*,
                   RF_params,
                   int) except +
-    
-    cdef void predict(cumlHandle& handle,
-                      RandomForestMetaData[float, float] *,
-                      float*,
-                      int,
-                      int,
-                      float*,
-                      int) except +
-
-    cdef void predict(cumlHandle& handle,
-                      RandomForestMetaData[double, double]*,
-                      double*,
-                      int,
-                      int,
-                      double*,
-                      int) except +
+
+    cdef void predict[T, T](cumlHandle& handle,
+                            RandomForestMetaData[T, T] *,
+                            T*,
+                            int,
+                            int,
+                            T*,
+                            int) except +
 
     cdef RF_metrics score(cumlHandle& handle,
                           RandomForestMetaData[float, float]*,
@@ -292,34 +284,6 @@ class RandomForestRegressor(BaseRandomForestModel):
             self.treelite_handle = None
             self.model_pbuf_bytes = bytearray()
 
-    def _get_protobuf_bytes(self):
-        """
-        Returns the self.model_pbuf_bytes.
-        Cuml RF model gets converted to treelite protobuf bytes by:
-            1. converting the cuml RF model to a treelite model. The treelite
-            models handle (pointer) is returned
-            2. The treelite model handle is used to convert the treelite model
-            to a treelite protobuf model which is stored in a temporary file.
-            The protobuf model information is read from the temporary file and
-            the byte information is returned.
-        The treelite handle is stored `self.treelite_handle` and the treelite
-        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
-        of information is already present in the model then the respective
-        step is skipped.
-        """
-        if self.model_pbuf_bytes:
-            return self.model_pbuf_bytes
-        elif self.treelite_handle:
-            fit_mod_ptr = self.treelite_handle
-        else:
-            fit_mod_ptr = self._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self.model_pbuf_bytes
 
     def convert_to_treelite_model(self):
         """
@@ -352,27 +316,6 @@ class RandomForestRegressor(BaseRandomForestModel):
 
         return ctypes.c_void_p(mod_handle).value
 
-    def _concatenate_treelite_handle(self, treelite_handle):
-        cdef ModelHandle concat_model_handle = NULL
-        cdef vector[ModelHandle] *model_handles \
-            = new vector[ModelHandle]()
-        cdef uintptr_t mod_ptr
-        for i in treelite_handle:
-            mod_ptr = <uintptr_t>i
-            model_handles.push_back((
-                <ModelHandle> mod_ptr))
-
-        concat_model_handle = concatenate_trees(deref(model_handles))
-
-        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
-        self.treelite_handle = concat_model_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> concat_model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self
-
     def fit(self, X, y, convert_dtype=False):
         """
         Perform Random Forest Regression on the input data
@@ -461,7 +404,7 @@ class RandomForestRegressor(BaseRandomForestModel):
                                 check_cols=self.n_cols)
         X_ptr = X_m.ptr
 
-        preds = CumlArray.zeros(n_rows, dtype=dtype)
+        preds = CumlArray.zeros(n_rows, dtype=np.int32)
         cdef uintptr_t preds_ptr = preds.ptr
 
         cdef cumlHandle* handle_ =\
@@ -490,7 +433,7 @@ class RandomForestRegressor(BaseRandomForestModel):
                     <double*> preds_ptr,
                     <int> self.verbosity)
         else:
-            raise TypeError("supports only float32 and float64 input,"
+            raise TypeError("supports only np.float32 and np.float64 input,"
                             " but input of type '%s' passed."
                             % (str(self.dtype)))
 

From f4ddf130636b6a7300ea51c46c030da164ce3af0 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Thu, 14 May 2020 08:08:07 -0500
Subject: [PATCH 05/32] created cdef class to assign rfmetadata

---
 cpp/include/cuml/ensemble/randomforest.hpp    |   3 +
 cpp/src/randomforest/randomforest.cu          |  30 ++++
 python/cuml/ensemble/randomforest_common.pyx  | 170 +++++++-----------
 python/cuml/ensemble/randomforest_shared.pxd  |  26 +--
 .../cuml/ensemble/randomforestregressor.pyx   |  38 ++--
 5 files changed, 135 insertions(+), 132 deletions(-)

diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index f3ddcec3ad..e2cb4e1138 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -131,6 +131,9 @@ ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles);
 
 void compare_concat_forest_to_subforests(
   ModelHandle concat_tree_handle, std::vector<ModelHandle> treelite_handles);
+
+template <class T, class L>
+RandomForestMetaData<T, L>* create_meta(T a, L b);
 // ----------------------------- Classification ----------------------------------- //
 
 typedef RandomForestMetaData<float, int> RandomForestClassifierF;
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index ee3ffea291..99beb93d10 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -220,6 +220,11 @@ void print(const RF_params rf_params) {
   DecisionTree::print(rf_params.tree_params);
 }
 
+template <class T, class L>
+RandomForestMetaData<T, L>* create_meta(T a, L b) {
+  RandomForestMetaData<T, L>* rf = new RandomForestMetaData<T, L>();
+  return rf;
+}
 /**
  * @brief Set the trees pointer of RandomForestMetaData to nullptr.
  * @param[in, out] forest: CPU pointer to RandomForestMetaData.
@@ -754,6 +759,31 @@ RF_metrics score(const cumlHandle& user_handle,
 /** @} */
 
 // Functions' specializations
+
+template RandomForestMetaData<float, float>* create_meta<float, float>(float a, float b);
+template RandomForestMetaData<double, double>* create_meta<double, double>(double a, double b);
+template RandomForestMetaData<float, int>* create_meta<float, int>(float a , int b);
+template RandomForestMetaData<double, int>* create_meta<double, int>(double a, int b);
+
+
+template void predict<double, int>(
+  const cumlHandle& user_handle,
+  const RandomForestMetaData<double, int>* forest, const double* input,
+  int n_rows, int n_cols, int* predictions, int verbosity);
+template void predict<float, int>(
+  const cumlHandle& user_handle,
+  const RandomForestMetaData<float, int>* forest, const float* input,
+  int n_rows, int n_cols, int* predictions, int verbosity);
+
+template void predict<float>(
+  const cumlHandle& user_handle,
+  const RandomForestMetaData<float, float>* forest, const float* input,
+  int n_rows, int n_cols, float* predictions, int verbosity);
+template void predict<double>(
+  const cumlHandle& user_handle,
+  const RandomForestMetaData<double, double>* forest, const double* input,
+  int n_rows, int n_cols, double* predictions, int verbosity);
+
 template void print_rf_summary<float, int>(
   const RandomForestClassifierF* forest);
 template void print_rf_summary<double, int>(
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index a8ecdef6fb..ee8dcb78e3 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -33,10 +33,56 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 cimport cython
 
-#RandomForestMetaData[X_dtype, y_dtype] *meta
-#cdef creat_meta(X_dtype a, y_dtype b):
-#    meta = new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]()
-    #return meta
+
+cdef class BaseRandomForestModel_impl():
+    cpdef creat_meta(self, X_dtype a, y_dtype b):
+        meta = \
+            new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]()
+        cdef RandomForestMetaData[float, int] *rf_class
+        cdef RandomForestMetaData[float, float] *rf_reg
+        if cython.typeof(b) == cython.int:
+            rf_class = <RandomForestMetaData[float, int]*><size_t> self.rf_forest
+            meta = rf_class
+        else:
+            rf_reg = <RandomForestMetaData[float, float]*><size_t> self.rf_forest
+            meta = rf_reg
+        
+    def _obtain_treelite_handle(self):
+        if self.treelite_handle:
+            return self.treelite_handle
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
+        cdef vector[unsigned char] model_pbuf_vec
+        with cython.boundscheck(False):
+            model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
+
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        cdef cython.float a = 10.0
+        cdef cython.int b = 5
+        base_rf = BaseRandomForestModel_impl()
+        base_rf.creat_meta(a, b)
+        if self.RF_type == CLASSIFICATION:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                #<RandomForestMetaData[float, int]*><size_t> self.rf_forest,
+                meta,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+        else:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                #<RandomForestMetaData[float, float]*><size_t> self.rf_forest,
+                meta,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+            mod_ptr = <uintptr_t> cuml_model_ptr
+            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+          
+        return self.treelite_handle
 
 
 class BaseRandomForestModel(Base):
@@ -48,6 +94,9 @@ class BaseRandomForestModel(Base):
                  'verbose', 'rows_sample',
                  'max_leaves', 'quantile_per_tree']
 
+    def __init__(self):
+        self._impl = BaseRandomForestModel_impl()
+
     def _create_model(self, model, seed, split_criterion,
                       n_streams, n_estimators=100,
                       max_depth=16, handle=None, max_features='auto',
@@ -146,36 +195,8 @@ class BaseRandomForestModel(Base):
                              " please read the documentation")
 
     def _obtain_treelite_handle(self):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-
-        mod_ptr = <uintptr_t> cuml_model_ptr
-        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-
-        #cdef object (*meta_info)(float, int)
-        #meta_info = create_meta
-        
-        #cdef RandomForestMetaData[cython.typeof(a), cython.typeof(b)] *meta_info = get_meta_data[cython.typeof(a), cython.typeof(b)](<void*> self.rf_forest)
-            #<RandomForestMetaData[cython.typeof(a), cython.typeof(b)]><uintptr_t> self.forest #get_meta_data[float, int](<uintptr_t> self.rf_forest)
-        #cdef RandomForestMetaData[float, int] *rf_forest
-        #cdef fused_rf_meta *forest = \
-        #    <fused_rf_meta[cython.float, cython.int]*><uintptr_t>self.rf_forest
-        if self.treelite_handle is None:
-            build_treelite_forest(
-                & cuml_model_ptr,
-                meta,
-                <int> self.n_cols,
-                <int> self.num_classes,
-                model_pbuf_vec)
-            mod_ptr = <uintptr_t> cuml_model_ptr
-            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
           
-        return self.treelite_handle
+        return self._impl._obtain_treelite_handle
 
     def _dataset_setup(self, X, y, convert_dtype):
         self._set_output_type(X)
@@ -249,6 +270,19 @@ class BaseRandomForestModel(Base):
         self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
         return self.model_pbuf_bytes
 
+    def _tl_model_handles(self, model_bytes):
+        cdef ModelHandle cuml_model_ptr = NULL
+        meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
+        task_category = CLASSIFICATION_MODEL
+        build_treelite_forest(& cuml_model_ptr,
+                              meta,
+                              <int> self.n_cols,
+                              <int> task_category,
+                              <vector[unsigned char] &> model_bytes)
+        mod_handle = <uintptr_t> cuml_model_ptr
+
+        return ctypes.c_void_p(mod_handle).value
+
     def _concatenate_treelite_handle(self, treelite_handle):
         cdef ModelHandle concat_model_handle = NULL
         cdef vector[ModelHandle] *model_handles \
@@ -307,54 +341,6 @@ class BaseRandomForestModel(Base):
         tl.free_treelite_model(self.treelite_handle)
         return preds
 
-    def _predict_model_on_cpu(self, X, convert_dtype):
-        out_type = self._get_output_type(X)
-        cdef uintptr_t X_ptr
-        X_m, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='C',
-                                convert_to_dtype=(self.dtype if convert_dtype
-                                                  else None),
-                                check_cols=self.n_cols)
-        X_ptr = X_m.ptr
-
-        preds = CumlArray.zeros(n_rows, dtype=dtype)
-        cdef uintptr_t preds_ptr = preds.ptr
-
-        cdef cumlHandle* handle_ =\
-            <cumlHandle*><uintptr_t>self.handle.getHandle()
-
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-
-        cdef RandomForestMetaData[double, int] *rf_forest64 = \
-            <RandomForestMetaData[double, int]*><uintptr_t> self.rf_forest64
-        if self.dtype == np.float32:
-            predict(handle_[0],
-                    rf_forest,
-                    <float*> X_ptr,
-                    <int> n_rows,
-                    <int> n_cols,
-                    <float*> preds_ptr,
-                    <int> self.verbosity)
-
-        elif self.dtype == np.float64:
-            predict(handle_[0],
-                    rf_forest64,
-                    <double*> X_ptr,
-                    <int> n_rows,
-                    <int> n_cols,
-                    <double*> preds_ptr,
-                    <int> self.verbosity)
-        else:
-            raise TypeError("supports only float32 and float64 input,"
-                            " but input of type '%s' passed."
-                            % (str(self.dtype)))
-
-        self.handle.sync()
-        # synchronous w/o a stream
-        del(X_m)
-        return preds.to_output(out_type)
-
     def _get_params(self, model, deep):
         params = dict()
         for key in model.variables:
@@ -377,28 +363,6 @@ class BaseRandomForestModel(Base):
                 setattr(self, key, value)
         return self
 
-    """
-    def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef rf_class_float *rf_forest_class
-        cdef rf_reg_float *rf_forest_reg
-        if task_category == CLASSIFICATION:
-            rf_forest_class = \
-                <rf_meta_type*><uintptr_t> self.rf_forest
-
-        else:
-            rf_forest_reg = \
-                <rf_meta_type*><uintptr_t> self.rf_forest
-        build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr,
-                                  rf_forest_reg,
-                                  <int> self.n_cols,
-                                  <int> task_category,
-                                  <vector[unsigned char] &> self.model_pbuf_bytes)
-        mod_ptr = <size_t> cuml_model_ptr
-        treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return treelite_handle
-
-    """
     def _get_protobuf_bytes_common(self, model):
         fit_mod_ptr = model._obtain_treelite_handle()
         cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index d9270a3ead..dc7b197bd2 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -114,12 +114,25 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
 
     ctypedef fused fused_rf_meta:
        RandomForestMetaData[float, float]
-       RandomForestMetaData[double, double]
+       #RandomForestMetaData[double, double]
        RandomForestMetaData[float, int]
-       RandomForestMetaData[double, int]
+       #RandomForestMetaData[double, int]
 
-    cdef fused_rf_meta *meta
+    ctypedef fused X_dtype:
+        cython.float
+        cython.double
 
+    ctypedef fused y_dtype:
+        cython.int
+        cython.float
+        cython.double
+
+    cpdef fused_rf_meta *meta
+
+    cdef RandomForestMetaData[T, L]* create_meta[T, L](T a, L b)
+    #cdef fused_rf_meta *meta
+    #fused_rf_meta = cython.fused(RandomForestRegressorF, RandomForestRegressorD,
+    #                             RandomForestClassifierF, RandomForestClassifierD)
     #
     # Treelite handling
     #
@@ -157,10 +170,3 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
     cdef ModelHandle concatenate_trees(
         vector[ModelHandle] &treelite_handles) except +
 
-    cdef void predict[T, L](cumlHandle& handle,
-                      RandomForestMetaData[T, L] *,
-                      T*,
-                      int,
-                      int,
-                      L*,
-                      int) except +
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index b356e8d901..a811b32518 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -69,13 +69,13 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   RF_params,
                   int) except +
 
-    cdef void predict[T, T](cumlHandle& handle,
-                            RandomForestMetaData[T, T] *,
-                            T*,
-                            int,
-                            int,
-                            T*,
-                            int) except +
+    cdef void predict[T](cumlHandle& handle,
+                         RandomForestMetaData[T, T] *,
+                         T*,
+                         int,
+                         int,
+                         T*,
+                         int) except +
 
     cdef RF_metrics score(cumlHandle& handle,
                           RandomForestMetaData[float, float]*,
@@ -417,21 +417,21 @@ class RandomForestRegressor(BaseRandomForestModel):
             <RandomForestMetaData[double, double]*><uintptr_t> self.rf_forest64
         if self.dtype == np.float32:
             predict(handle_[0],
-                    rf_forest,
-                    <float*> X_ptr,
-                    <int> n_rows,
-                    <int> n_cols,
-                    <float*> preds_ptr,
-                    <int> self.verbosity)
+                           rf_forest,
+                           <float*> X_ptr,
+                           <int> n_rows,
+                           <int> n_cols,
+                           <float*> preds_ptr,
+                           <int> self.verbosity)
 
         elif self.dtype == np.float64:
             predict(handle_[0],
-                    rf_forest64,
-                    <double*> X_ptr,
-                    <int> n_rows,
-                    <int> n_cols,
-                    <double*> preds_ptr,
-                    <int> self.verbosity)
+                            rf_forest64,
+                            <double*> X_ptr,
+                            <int> n_rows,
+                            <int> n_cols,
+                            <double*> preds_ptr,
+                            <int> self.verbosity)
         else:
             raise TypeError("supports only np.float32 and np.float64 input,"
                             " but input of type '%s' passed."

From 6e64046fdf27ad76c34a1ec3bccfd6301411dfbd Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Thu, 21 May 2020 12:14:42 -0500
Subject: [PATCH 06/32] updated rf cython code

---
 python/cuml/ensemble/base.pyx                 | 243 ++++++++++++++++++
 python/cuml/ensemble/randomforest_common.pyx  | 237 +++++++++--------
 python/cuml/ensemble/randomforest_shared.pxd  |  22 --
 .../cuml/ensemble/randomforestclassifier.pyx  | 178 ++++++++++---
 .../cuml/ensemble/randomforestregressor.pyx   | 189 +++++++++++---
 5 files changed, 665 insertions(+), 204 deletions(-)
 create mode 100644 python/cuml/ensemble/base.pyx

diff --git a/python/cuml/ensemble/base.pyx b/python/cuml/ensemble/base.pyx
new file mode 100644
index 0000000000..1549dfd3cc
--- /dev/null
+++ b/python/cuml/ensemble/base.pyx
@@ -0,0 +1,243 @@
+import ctypes
+import cupy as cp
+import math
+import warnings
+
+import numpy as np
+from cuml import ForestInference
+from cuml.fil.fil import TreeliteModel as tl
+from cuml.common.handle import Handle
+from cuml.common.base import Base
+from cuml.common.array import CumlArray
+
+from cython.operator cimport dereference as deref
+
+from cuml.ensemble.randomforest_shared cimport *
+from cuml.common import input_to_cuml_array, rmm_cupy_ary
+
+
+
+class BaseRandomForestModel(Base):
+    variables = ['n_estimators', 'max_depth', 'handle',
+                 'max_features', 'n_bins',
+                 'split_algo', 'split_criterion', 'min_rows_per_node',
+                 'min_impurity_decrease',
+                 'bootstrap', 'bootstrap_features',
+                 'verbose', 'rows_sample',
+                 'max_leaves', 'quantile_per_tree']
+
+    def _create_model(self, model, seed, split_criterion,
+                      n_streams, n_estimators=100,
+                      max_depth=16, handle=None, max_features='auto',
+                      n_bins=8, split_algo=1, bootstrap=True,
+                      bootstrap_features=False,
+                      verbose=False, min_rows_per_node=2,
+                      rows_sample=1.0, max_leaves=-1,
+                      accuracy_metric=None, dtype=None,
+                      output_type=None, min_samples_leaf=None,
+                      min_weight_fraction_leaf=None, n_jobs=None,
+                      max_leaf_nodes=None, min_impurity_decrease=0.0,
+                      min_impurity_split=None, oob_score=None,
+                      random_state=None, warm_start=None, class_weight=None,
+                      quantile_per_tree=False, criterion=None):
+        """Validate the constructor arguments and store them on ``self``."""
+        if accuracy_metric and 'accuracy_metric' not in model.variables:
+            model.variables.append('accuracy_metric')
+        sklearn_params = {"criterion": criterion,
+                          "min_samples_leaf": min_samples_leaf,
+                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
+                          "max_leaf_nodes": max_leaf_nodes,
+                          "min_impurity_split": min_impurity_split,
+                          "oob_score": oob_score, "n_jobs": n_jobs,
+                          "random_state": random_state,
+                          "warm_start": warm_start,
+                          "class_weight": class_weight}
+        # Reject scikit-learn-only arguments instead of silently ignoring them.
+        for key, vals in sklearn_params.items():
+            if vals is not None:
+                raise TypeError("The Scikit-learn variable '%s'"
+                                " is not supported in cuML,"
+                                " please read the cuML documentation for"
+                                " more information" % key)
+
+        if handle is None:
+            handle = Handle(n_streams)
+
+        super(model, self).__init__(handle=handle,
+                                    verbose=verbose,
+                                    output_type=output_type)
+        if max_depth < 0:
+            raise ValueError("Must specify max_depth >0 ")
+
+        # The lookup is keyed by str(split_criterion); unknown values warn.
+        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
+                          '3': MAE, '4': CRITERION_END}
+        if str(split_criterion) not in criterion_dict.keys():
+            warnings.warn("The split criterion chosen was not present"
+                          " in the list of options accepted by the model"
+                          " and so the CRITERION_END option has been chosen.")
+            self.split_criterion = CRITERION_END
+        else:
+            self.split_criterion = criterion_dict[str(split_criterion)]
+        self.split_algo = split_algo
+        self.min_rows_per_node = min_rows_per_node
+        self.min_impurity_decrease = min_impurity_decrease
+        self.bootstrap_features = bootstrap_features
+        self.rows_sample = rows_sample
+        self.max_leaves = max_leaves
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.max_features = max_features
+        self.bootstrap = bootstrap
+        self.verbose = verbose
+        self.n_bins = n_bins
+        self.n_cols = None
+        self.dtype = dtype
+        self.accuracy_metric = accuracy_metric
+        self.quantile_per_tree = quantile_per_tree
+        self.n_streams = handle.getNumInternalStreams()
+        self.seed = seed
+        self.model_pbuf_bytes = bytearray()
+        self.treelite_handle = None
+
+    def _dataset_setup(self, X, y, convert_dtype):
+        """Convert X and y to cuML arrays and validate them before a fit."""
+        self._set_output_type(X)
+
+        # Reset the old tree data for new fit call
+        self._reset_forest_data()
+
+        # X is checked against float32/float64 and requested in 'F' order.
+        X_m, self.n_rows, self.n_cols, self.dtype = \
+            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
+                                order='F')
+        if self.RF_type == CLASSIFICATION:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y, check_dtype=np.int32,
+                                    convert_to_dtype=(np.int32 if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+            if y_dtype != np.int32:
+                raise TypeError("The labels `y` need to be of dtype `np.int32`")
+            unique_labels = rmm_cupy_ary(cp.unique, y_m)
+            self.num_classes = len(unique_labels)
+            for i in range(self.num_classes):
+                if i not in unique_labels:
+                    raise ValueError("The labels need "
+                                     "to be consecutive values from "
+                                     "0 to the number of unique label values")
+        else:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y,
+                                    convert_to_dtype=(self.dtype if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+
+        if self.dtype == np.float64:
+            warnings.warn("To use GPU-based prediction, first train using "
+                          "float 32 data to fit the estimator.")
+        max_feature_val = self._get_max_feat_val()
+        # A float min_rows_per_node is scaled by n_rows and rounded up.
+        if isinstance(self.min_rows_per_node, float):
+            self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows)
+
+        return X_m, y_m, max_feature_val
+
+    def _get_params(self, model, deep):
+        """Collect the estimator parameters listed in ``model.variables``.
+
+        Missing attributes map to None; ``deep`` is unused.
+        """
+        # The handle is deliberately left out of the returned dict.
+        names = (name for name in model.variables if name != 'handle')
+        return {name: getattr(self, name, None) for name in names}
+
+    def _set_params(self, model, **params):
+        """Set estimator parameters and drop the cached serialized forest."""
+        self.handle.__setstate__(self.n_streams)
+        # Same empty type as `_create_model` uses; a list is not a byte buffer.
+        self.model_pbuf_bytes = bytearray()
+
+        # Unknown names raise ValueError; an empty `params` is a no-op.
+        for key, value in params.items():
+            if key not in model.variables:
+                raise ValueError('Invalid parameter for estimator')
+            setattr(self, key, value)
+        return self
+
+
+def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
+    """Return the FIL storage format, rejecting deep dense/reorg forests."""
+    storage_format = _check_fil_sparse_format_value(fil_sparse_format)
+    if (depth > 16 and (storage_format == 'dense' or
+                        algo == 'tree_reorg' or
+                        algo == 'batch_tree_reorg')):
+        raise ValueError("While creating a forest with max_depth greater "
+                         "than 16, `fil_sparse_format` should be True. "
+                         "If `fil_sparse_format=False` then the memory "
+                         "consumed while creating the FIL forest is very "
+                         "large and the process will be aborted. In "
+                         "addition, `algo` must be either set to `naive' "
+                         "or `auto` to set 'fil_sparse_format=True`.")
+    return storage_format
+
+
+def _check_fil_sparse_format_value(fil_sparse_format):
+    """Map `fil_sparse_format` to a FIL storage format name.
+
+    'auto' -> 'auto', True -> 'sparse', False -> 'dense'; else ValueError.
+    """
+    accepted_vals = [True, False, 'auto']
+    if fil_sparse_format not in accepted_vals:
+        raise ValueError("The value entered for fil_sparse_format is not "
+                         "supported. Please refer to the documentation "
+                         "to see the accepted values.")
+    if fil_sparse_format == 'auto':
+        return 'auto'
+    # Validated above, so only the two boolean choices remain.
+    return 'sparse' if fil_sparse_format else 'dense'
+
+
+def _obtain_treelite_model(treelite_handle):
+    """
+    Builds a Treelite model object from the treelite handle
+    obtained from the cuML Random Forest model.
+
+    Returns
+    ----------
+    treelite_model :
+        The value returned by ``tl.from_treelite_model_handle`` for
+        ``treelite_handle``.
+    """
+    return tl.from_treelite_model_handle(treelite_handle)
+
+
+def _obtain_fil_model(treelite_handle, depth,
+                      output_class=True,
+                      threshold=0.5, algo='auto',
+                      fil_sparse_format='auto'):
+    """
+    Creates a Forest Inference (FIL) model using the treelite
+    handle obtained from the cuML Random Forest model.
+    The arguments are validated by `_check_fil_parameter_validity` first.
+
+    Returns
+    ----------
+    fil_model :
+        A Forest Inference model which can be used to perform
+        inferencing on the random forest model.
+    """
+    storage_format = \
+        _check_fil_parameter_validity(depth=depth,
+                                      fil_sparse_format=fil_sparse_format,
+                                      algo=algo)
+
+    fil_model = ForestInference()
+    tl_to_fil_model = \
+        fil_model.load_from_randomforest(treelite_handle,
+                                         output_class=output_class,
+                                         threshold=threshold,
+                                         algo=algo,
+                                         storage_type=storage_format)
+
+    return tl_to_fil_model
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index ee8dcb78e3..e9225847ce 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -33,58 +33,7 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 cimport cython
 
-
-cdef class BaseRandomForestModel_impl():
-    cpdef creat_meta(self, X_dtype a, y_dtype b):
-        meta = \
-            new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]()
-        cdef RandomForestMetaData[float, int] *rf_class
-        cdef RandomForestMetaData[float, float] *rf_reg
-        if cython.typeof(b) == cython.int:
-            rf_class = <RandomForestMetaData[float, int]*><size_t> self.rf_forest
-            meta = rf_class
-        else:
-            rf_reg = <RandomForestMetaData[float, float]*><size_t> self.rf_forest
-            meta = rf_reg
-        
-    def _obtain_treelite_handle(self):
-        if self.treelite_handle:
-            return self.treelite_handle
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-
-        mod_ptr = <uintptr_t> cuml_model_ptr
-        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        cdef cython.float a = 10.0
-        cdef cython.int b = 5
-        base_rf = BaseRandomForestModel_impl()
-        base_rf.creat_meta(a, b)
-        if self.RF_type == CLASSIFICATION:
-            build_treelite_forest(
-                & cuml_model_ptr,
-                #<RandomForestMetaData[float, int]*><size_t> self.rf_forest,
-                meta,
-                <int> self.n_cols,
-                <int> self.num_classes,
-                model_pbuf_vec)
-        else:
-            build_treelite_forest(
-                & cuml_model_ptr,
-                #<RandomForestMetaData[float, float]*><size_t> self.rf_forest,
-                meta,
-                <int> self.n_cols,
-                <int> self.num_classes,
-                model_pbuf_vec)
-            mod_ptr = <uintptr_t> cuml_model_ptr
-            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-          
-        return self.treelite_handle
-
-
+# create a cdef class and cdef func which will call the C++ cdef func and then return the required handle and stuff
 class BaseRandomForestModel(Base):
     variables = ['n_estimators', 'max_depth', 'handle',
                  'max_features', 'n_bins',
@@ -94,9 +43,6 @@ class BaseRandomForestModel(Base):
                  'verbose', 'rows_sample',
                  'max_leaves', 'quantile_per_tree']
 
-    def __init__(self):
-        self._impl = BaseRandomForestModel_impl()
-
     def _create_model(self, model, seed, split_criterion,
                       n_streams, n_estimators=100,
                       max_depth=16, handle=None, max_features='auto',
@@ -170,12 +116,7 @@ class BaseRandomForestModel(Base):
         self.seed = seed
         self.model_pbuf_bytes = bytearray()
         self.treelite_handle = None
-       # if self.model_type == curfr:
-       # print have a check for the random forest meta data in init
-    """
-    def _check_rf_meta_data_format(self, task_category):
-      if task_category == CLASSIFICATION
-    """
+
     def _get_max_feat_val(self):
         if type(self.max_features) == int:
             return self.max_features/self.n_cols
@@ -194,9 +135,67 @@ class BaseRandomForestModel(Base):
             raise ValueError("Wrong value passed in for max_features"
                              " please read the documentation")
 
+    def _get_protobuf_bytes(self):
+        """
+        Returns the self.model_pbuf_bytes.
+        Cuml RF model gets converted to treelite protobuf bytes by:
+            1. converting the cuml RF model to a treelite model. The treelite
+            models handle (pointer) is returned
+            2. The treelite model handle is used to convert the treelite model
+            to a treelite protobuf model which is stored in a temporary file.
+            The protobuf model information is read from the temporary file and
+            the byte information is returned.
+        The treelite handle is stored `self.treelite_handle` and the treelite
+        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
+        of information is already present in the model then the respective
+        step is skipped.
+        """
+        if self.model_pbuf_bytes:
+            return self.model_pbuf_bytes
+        elif self.treelite_handle:
+            fit_mod_ptr = self.treelite_handle
+        else:
+            fit_mod_ptr = self._obtain_treelite_handle()
+        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self.model_pbuf_bytes
+
     def _obtain_treelite_handle(self):
-          
-        return self._impl._obtain_treelite_handle
+        if self.treelite_handle:
+            print(" treelite handle in obt : ", self.treelite_handle)
+            return self.treelite_handle # Use cached version
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef unsigned char[::1] model_pbuf_mv
+        cdef vector[unsigned char] model_pbuf_vec
+        if self.model_pbuf_bytes:
+            model_pbuf_mv = self.model_pbuf_bytes
+            with cython.boundscheck(False):
+                model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                      & model_pbuf_mv[model_pbuf_mv.shape[0]])
+        else:
+            model_pbuf_vec = <vector[unsigned char]&> bytearray()
+        if self.RF_type == CLASSIFICATION:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+        else:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> REGRESSION_MODEL,
+                model_pbuf_vec)
+
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return self.treelite_handle
 
     def _dataset_setup(self, X, y, convert_dtype):
         self._set_output_type(X)
@@ -204,8 +203,6 @@ class BaseRandomForestModel(Base):
         # Reset the old tree data for new fit call
         self._reset_forest_data()
 
-        #cdef uintptr_t X_ptr, y_ptr
-
         X_m, self.n_rows, self.n_cols, self.dtype = \
             input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
                                 order='F')
@@ -241,48 +238,70 @@ class BaseRandomForestModel(Base):
 
         return X_m, y_m, max_feature_val
 
-    def _get_protobuf_bytes(self):
-        """
-        Returns the self.model_pbuf_bytes.
-        Cuml RF model gets converted to treelite protobuf bytes by:
-            1. converting the cuml RF model to a treelite model. The treelite
-            models handle (pointer) is returned
-            2. The treelite model handle is used to convert the treelite model
-            to a treelite protobuf model which is stored in a temporary file.
-            The protobuf model information is read from the temporary file and
-            the byte information is returned.
-        The treelite handle is stored `self.treelite_handle` and the treelite
-        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
-        of information is already present in the model then the respective
-        step is skipped.
-        """
-        if self.model_pbuf_bytes:
-            return self.model_pbuf_bytes
-        elif self.treelite_handle:
-            fit_mod_ptr = self.treelite_handle
-        else:
-            fit_mod_ptr = self._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self.model_pbuf_bytes
+    """
+    def _predict_model_on_gpu(self, X,
+                              algo,
+                              convert_dtype,
+                              fil_sparse_format,
+                              output_class=False,
+                              threshold=0.5,
+                              predict_proba=False):
+        out_type = self._get_output_type(X)
+        cdef ModelHandle cuml_model_ptr = NULL
+        _, n_rows, n_cols, dtype = \
+            input_to_cuml_array(X, order='F',
+                                check_cols=self.n_cols)
 
+        if dtype == np.float64 and not convert_dtype:
+            raise TypeError("GPU based predict only accepts np.float32 data. \
+                            Please set convert_dtype=True to convert the test \
+                            data to the same dtype as the data used to train, \
+                            ie. np.float32. If you would like to use test \
+                            data of dtype=np.float64 please set \
+                            predict_model='CPU' to use the CPU implementation \
+                            of predict.")
+
+        self._obtain_treelite_handle()
+        storage_type = \
+            _check_fil_parameter_validity(depth=self.max_depth,
+                                          fil_sparse_format=fil_sparse_format,
+                                          algo=algo)
+
+        fil_model = ForestInference()
+        tl_to_fil_model = \
+            fil_model.load_from_randomforest(self.treelite_handle,
+                                             output_class=output_class,
+                                             threshold=threshold,
+                                             algo=algo,
+                                             storage_type=storage_type)
+
+        preds = tl_to_fil_model.predict(X, output_type=out_type,
+                                        predict_proba=predict_proba)
+        tl.free_treelite_model(self.treelite_handle)
+        return preds
+    """
+   
     def _tl_model_handles(self, model_bytes):
         cdef ModelHandle cuml_model_ptr = NULL
-        meta = <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-        task_category = CLASSIFICATION_MODEL
-        build_treelite_forest(& cuml_model_ptr,
-                              meta,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> model_bytes)
+        if self.RF_type == CLASSIFICATION:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                <vector[unsigned char] &> model_bytes)
+        else:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> 1,
+                <vector[unsigned char] &> model_bytes)
         mod_handle = <uintptr_t> cuml_model_ptr
 
         return ctypes.c_void_p(mod_handle).value
 
+
     def _concatenate_treelite_handle(self, treelite_handle):
         cdef ModelHandle concat_model_handle = NULL
         cdef vector[ModelHandle] *model_handles \
@@ -292,11 +311,11 @@ class BaseRandomForestModel(Base):
             mod_ptr = <uintptr_t>i
             model_handles.push_back((
                 <ModelHandle> mod_ptr))
-
+        print(" run the concat c++ func")
         concat_model_handle = concatenate_trees(deref(model_handles))
-
         cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
         self.treelite_handle = concat_model_ptr
+        print(" treelite handle in concat : ", self.treelite_handle)
         cdef vector[unsigned char] pbuf_mod_info = \
             save_model(<ModelHandle> concat_model_ptr)
         cdef unsigned char[::1] pbuf_mod_view = \
@@ -304,7 +323,8 @@ class BaseRandomForestModel(Base):
         self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
         return self
 
-    def _predict_model_on_gpu(self, model, X, algo, convert_dtype,
+    
+    def _predict_model_on_gpu(self, X, algo, convert_dtype,
                               fil_sparse_format, threshold=0.5,
                               output_class=False, predict_proba=False):
         out_type = self._get_output_type(X)
@@ -339,8 +359,9 @@ class BaseRandomForestModel(Base):
         preds = tl_to_fil_model.predict(X, output_type=out_type,
                                         predict_proba=predict_proba)
         tl.free_treelite_model(self.treelite_handle)
+        self.treelite_handle = None
         return preds
-
+    
     def _get_params(self, model, deep):
         params = dict()
         for key in model.variables:
@@ -363,12 +384,6 @@ class BaseRandomForestModel(Base):
                 setattr(self, key, value)
         return self
 
-    def _get_protobuf_bytes_common(self, model):
-        fit_mod_ptr = model._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        model_protobuf_bytes = save_model(<ModelHandle> model_ptr)
-        return model_protobuf_bytes
-
 
 def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
     storage_format = _check_fil_sparse_format_value(fil_sparse_format)
@@ -397,7 +412,6 @@ def _check_fil_sparse_format_value(fil_sparse_format):
                          "to see the accepted values.")
     else:
         storage_format = 'sparse'
-
     return storage_format
 
 
@@ -429,7 +443,6 @@ def _obtain_fil_model(treelite_handle, depth,
         A Forest Inference model which can be used to perform
         inferencing on the random forest model.
     """
-
     storage_format = \
         _check_fil_parameter_validity(depth=depth,
                                       fil_sparse_format=fil_sparse_format,
diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index dc7b197bd2..05abfa7135 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -111,28 +111,6 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
         ctypedef TreeMetaDataNode[T, L]* trees
         RF_params rf_params
 
-
-    ctypedef fused fused_rf_meta:
-       RandomForestMetaData[float, float]
-       #RandomForestMetaData[double, double]
-       RandomForestMetaData[float, int]
-       #RandomForestMetaData[double, int]
-
-    ctypedef fused X_dtype:
-        cython.float
-        cython.double
-
-    ctypedef fused y_dtype:
-        cython.int
-        cython.float
-        cython.double
-
-    cpdef fused_rf_meta *meta
-
-    cdef RandomForestMetaData[T, L]* create_meta[T, L](T a, L b)
-    #cdef fused_rf_meta *meta
-    #fused_rf_meta = cython.fused(RandomForestRegressorF, RandomForestRegressorD,
-    #                             RandomForestClassifierF, RandomForestClassifierD)
     #
     # Treelite handling
     #
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 5a98251932..e6de676912 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -33,9 +33,10 @@ from libcpp.vector cimport vector
 from libc.stdint cimport uintptr_t
 from libc.stdlib cimport calloc, malloc, free
 
+from cython.operator cimport dereference as deref
+
 from cuml import ForestInference
 from cuml.common.array import CumlArray
-from cuml.common.base import Base
 from cuml.common.handle import Handle
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
 
@@ -75,13 +76,21 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   RF_params,
                   int) except +
 
-    cdef void predict[T, L](cumlHandle& handle,
-                            RandomForestMetaData[T, L] *,
-                            T*,
-                            int,
-                            int,
-                            L*,
-                            int) except +
+    cdef void predict(cumlHandle& handle,
+                      RandomForestMetaData[float, int] *,
+                      float*,
+                      int,
+                      int,
+                      int*,
+                      bool) except +
+
+    cdef void predict(cumlHandle& handle,
+                      RandomForestMetaData[double, int]*,
+                      double*,
+                      int,
+                      int,
+                      int*,
+                      bool) except +
 
     cdef void predictGetAll(cumlHandle& handle,
                             RandomForestMetaData[float, int] *,
@@ -299,6 +308,62 @@ class RandomForestClassifier(BaseRandomForestModel):
             free(<RandomForestMetaData[double, int]*><uintptr_t>
                  self.rf_forest64)
 
+    """
+    def _obtain_treelite_handle(self):
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef RandomForestMetaData[float, int] *rf_forest = \
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
+        if self.num_classes > 2:
+            raise NotImplementedError("Pickling for multi-class "
+                                      "classification models is currently not "
+                                      "implemented. Please check cuml issue "
+                                      "#1679 for more information.")
+        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
+        cdef vector[unsigned char] model_pbuf_vec
+        with cython.boundscheck(False):
+            model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
+        if self.treelite_handle is None:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+            mod_ptr = <uintptr_t> cuml_model_ptr
+            self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return self.treelite_handle
+    
+    def _get_protobuf_bytes(self):
+        
+        Returns the self.model_pbuf_bytes.
+        Cuml RF model gets converted to treelite protobuf bytes by:
+            1. converting the cuml RF model to a treelite model. The treelite
+            models handle (pointer) is returned
+            2. The treelite model handle is used to convert the treelite model
+            to a treelite protobuf model which is stored in a temporary file.
+            The protobuf model information is read from the temporary file and
+            the byte information is returned.
+        The treelite handle is stored `self.treelite_handle` and the treelite
+        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
+        of information is already present in the model then the respective
+        step is skipped.
+        
+        if self.model_pbuf_bytes:
+            return self.model_pbuf_bytes
+        elif self.treelite_handle:
+            fit_mod_ptr = self.treelite_handle
+        else:
+            fit_mod_ptr = self._obtain_treelite_handle()
+        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self.model_pbuf_bytes
+      """
+
     def convert_to_treelite_model(self):
         """
         Converts the cuML RF model to a Treelite model
@@ -356,7 +421,10 @@ class RandomForestClassifier(BaseRandomForestModel):
             A Forest Inference model which can be used to perform
             inferencing on the random forest model.
         """
+        print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%")
+        print(" self.treelite handle in conv to fil : ", self.treelite_handle)
         treelite_handle = self._obtain_treelite_handle()
+        print(" self.treelite_handle in convert_to_fil : ", self.treelite_handle)
         return _obtain_fil_model(treelite_handle=treelite_handle,
                                  depth=self.max_depth,
                                  output_class=output_class,
@@ -367,20 +435,28 @@ class RandomForestClassifier(BaseRandomForestModel):
     """
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
+    
+
+    def _concatenate_treelite_handle(self, treelite_handle):
+        cdef ModelHandle concat_model_handle = NULL
+        cdef vector[ModelHandle] *model_handles \
+            = new vector[ModelHandle]()
+        cdef uintptr_t mod_ptr
+        for i in treelite_handle:
+            mod_ptr = <uintptr_t>i
+            model_handles.push_back((
+                <ModelHandle> mod_ptr))
+
+        concat_model_handle = concatenate_trees(deref(model_handles))
+        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
+        self.treelite_handle = concat_model_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> concat_model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self
     """
-    def _tl_model_handles(self, model_bytes):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-        task_category = CLASSIFICATION_MODEL
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> model_bytes)
-        mod_handle = <uintptr_t> cuml_model_ptr
-
-        return ctypes.c_void_p(mod_handle).value
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -472,6 +548,54 @@ class RandomForestClassifier(BaseRandomForestModel):
         del y_m
         return self
     """
+    def _predict_model_on_gpu(self, X, output_class,
+                              threshold, algo,
+                              convert_dtype,
+                              fil_sparse_format, predict_proba):
+        out_type = self._get_output_type(X)
+        cdef ModelHandle cuml_model_ptr = NULL
+        _, n_rows, n_cols, dtype = \
+            input_to_cuml_array(X, order='F',
+                                check_cols=self.n_cols)
+
+        if dtype == np.float64 and not convert_dtype:
+            raise TypeError("GPU based predict only accepts np.float32 data. \
+                            Please set convert_dtype=True to convert the test \
+                            data to the same dtype as the data used to train, \
+                            ie. np.float32. If you would like to use test \
+                            data of dtype=np.float64 please set \
+                            predict_model='CPU' to use the CPU implementation \
+                            of predict.")
+
+        cdef RandomForestMetaData[float, int] *rf_forest = \
+            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
+
+        build_treelite_forest(& cuml_model_ptr,
+                              rf_forest,
+                              <int> n_cols,
+                              <int> self.num_classes,
+                              <vector[unsigned char] &> self.model_pbuf_bytes)
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        treelite_handle = ctypes.c_void_p(mod_ptr).value
+
+        storage_type = \
+            _check_fil_parameter_validity(depth=self.max_depth,
+                                          fil_sparse_format=fil_sparse_format,
+                                          algo=algo)
+
+        fil_model = ForestInference()
+        tl_to_fil_model = \
+            fil_model.load_from_randomforest(treelite_handle,
+                                             output_class=output_class,
+                                             threshold=threshold,
+                                             algo=algo,
+                                             storage_type=storage_type)
+
+        preds = tl_to_fil_model.predict(X, output_type=out_type,
+                                        predict_proba=predict_proba)
+        tl.free_treelite_model(treelite_handle)
+        return preds
+      """
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr
@@ -519,12 +643,11 @@ class RandomForestClassifier(BaseRandomForestModel):
         # synchronous w/o a stream
         del(X_m)
         return preds.to_output(out_type)
-        """
 
     def predict(self, X, predict_model="GPU",
                 output_class=True, threshold=0.5,
                 algo='auto',
-                num_classes=2, convert_dtype=True,
+                convert_dtype=True,
                 fil_sparse_format='auto'):
         """
         Predicts the labels for X.
@@ -599,8 +722,7 @@ class RandomForestClassifier(BaseRandomForestModel):
 
         else:
             preds = \
-                self._predict_model_on_gpu(model=RandomForestClassifier,
-                                           X=X, output_class=output_class,
+                self._predict_model_on_gpu(X=X, output_class=output_class,
                                            threshold=threshold,
                                            algo=algo,
                                            convert_dtype=convert_dtype,
@@ -671,7 +793,7 @@ class RandomForestClassifier(BaseRandomForestModel):
 
     def predict_proba(self, X, output_class=True,
                       threshold=0.5, algo='auto',
-                      num_classes=2, convert_dtype=True,
+                      convert_dtype=True,
                       fil_sparse_format='auto'):
         """
         Predicts class probabilites for X. This function uses the GPU
@@ -745,8 +867,7 @@ class RandomForestClassifier(BaseRandomForestModel):
                                       "implemented. Please check cuml issue "
                                       "#1679 for more information.")
         preds_proba = \
-            self._predict_model_on_gpu(model=RandomForestClassifier,
-                                       X=X, output_class=output_class,
+            self._predict_model_on_gpu(X, output_class=output_class,
                                        threshold=threshold,
                                        algo=algo,
                                        convert_dtype=convert_dtype,
@@ -821,7 +942,6 @@ class RandomForestClassifier(BaseRandomForestModel):
         y_ptr = y_m.ptr
         preds = self.predict(X, output_class=True,
                              threshold=threshold, algo=algo,
-                             num_classes=num_classes,
                              convert_dtype=convert_dtype,
                              predict_model=predict_model,
                              fil_sparse_format=fil_sparse_format)
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index a811b32518..2d3a9a4af0 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -21,6 +21,7 @@
 
 import ctypes
 import cudf
+import math
 import numpy as np
 import warnings
 
@@ -31,7 +32,6 @@ from libc.stdlib cimport calloc, malloc, free
 
 from cuml import ForestInference
 from cuml.common.array import CumlArray
-from cuml.common.base import Base
 from cuml.common.handle import Handle
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
 from cuml.common.handle cimport cumlHandle
@@ -68,14 +68,22 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   double*,
                   RF_params,
                   int) except +
-
-    cdef void predict[T](cumlHandle& handle,
-                         RandomForestMetaData[T, T] *,
-                         T*,
-                         int,
-                         int,
-                         T*,
-                         int) except +
+    
+    cdef void predict(cumlHandle& handle,
+                      RandomForestMetaData[float, float] *,
+                      float*,
+                      int,
+                      int,
+                      float*,
+                      int) except +
+
+    cdef void predict(cumlHandle& handle,
+                      RandomForestMetaData[double, double]*,
+                      double*,
+                      int,
+                      int,
+                      double*,
+                      int) except +
 
     cdef RF_metrics score(cumlHandle& handle,
                           RandomForestMetaData[float, float]*,
@@ -281,9 +289,6 @@ class RandomForestRegressor(BaseRandomForestModel):
                  self.rf_forest)
             free(<RandomForestMetaData[double, double]*><uintptr_t>
                  self.rf_forest64)
-            self.treelite_handle = None
-            self.model_pbuf_bytes = bytearray()
-
 
     def convert_to_treelite_model(self):
         """
@@ -297,24 +302,80 @@ class RandomForestRegressor(BaseRandomForestModel):
 
         return _obtain_treelite_model(handle)
 
+    def convert_to_fil_model(self, output_class=False,
+                             algo='auto',
+                             fil_sparse_format='auto'):
+        """
+        Create a Forest Inference (FIL) model from the trained cuML
+        Random Forest model.
+        Parameters
+        ----------
+        output_class : boolean (default = False)
+            This is optional and required only while performing the
+            predict operation on the GPU.
+            If true, return a 1 or 0 depending on whether the raw
+            prediction exceeds the threshold. If False, just return
+            the raw prediction.
+        algo : string (default = 'auto')
+            This is optional and required only while performing the
+            predict operation on the GPU.
+            'naive' - simple inference using shared memory
+            'tree_reorg' - similar to naive but trees rearranged to be more
+            coalescing-friendly
+            'batch_tree_reorg' - similar to tree_reorg but predicting
+            multiple rows per thread block
+            `auto` - choose the algorithm automatically. Currently
+            'batch_tree_reorg' is used for dense storage
+            and 'naive' for sparse storage
+        fil_sparse_format : boolean or string (default = 'auto')
+            This variable is used to choose the type of forest that will be
+            created in the Forest Inference Library. It is not required
+            while using predict_model='CPU'.
+            'auto' - choose the storage type automatically
+            (currently True is chosen by auto)
+            False - create a dense forest
+            True - create a sparse forest, requires algo='naive'
+            or algo='auto'
+        Returns
+        ----------
+        fil_model :
+            A Forest Inference model which can be used to perform
+            inferencing on the random forest model.
+        """
+        treelite_handle = self._obtain_treelite_handle()
+        return _obtain_fil_model(treelite_handle=treelite_handle,
+                                 depth=self.max_depth,
+                                 output_class=output_class,
+                                 algo=algo,
+                                 fil_sparse_format=fil_sparse_format)
+
 
     """
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
     """
-    def _tl_model_handles(self, model_bytes):
-        task_category = REGRESSION_MODEL
-        cdef ModelHandle tl_model_ptr = NULL
-        cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
-        build_treelite_forest(& tl_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> model_bytes)
-        mod_handle = <uintptr_t> tl_model_ptr
-
-        return ctypes.c_void_p(mod_handle).value
+    """
+    def _concatenate_treelite_handle(self, treelite_handle):
+        cdef ModelHandle concat_model_handle = NULL
+        cdef vector[ModelHandle] *model_handles \
+            = new vector[ModelHandle]()
+        cdef uintptr_t mod_ptr
+        for i in treelite_handle:
+            mod_ptr = <uintptr_t>i
+            model_handles.push_back((
+                <ModelHandle> mod_ptr))
+
+        concat_model_handle = concatenate_trees(deref(model_handles))
+
+        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
+        self.treelite_handle = concat_model_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> concat_model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self
+    """
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -393,6 +454,53 @@ class RandomForestRegressor(BaseRandomForestModel):
         del X_m
         del y_m
         return self
+    
+    """
+    def _predict_model_on_gpu(self, X, algo, convert_dtype,
+                              fil_sparse_format):
+        out_type = self._get_output_type(X)
+        cdef ModelHandle cuml_model_ptr = NULL
+        _, n_rows, n_cols, dtype = \
+            input_to_cuml_array(X, order='F',
+                                check_cols=self.n_cols)
+
+        if dtype == np.float64 and not convert_dtype:
+            raise TypeError("GPU based predict only accepts np.float32 data. \
+                            Please set convert_dtype=True to convert the test \
+                            data to the same dtype as the data used to train, \
+                            ie. np.float32. If you would like to use test \
+                            data of dtype=np.float64 please set \
+                            predict_model='CPU' to use the CPU implementation \
+                            of predict.")
+
+        cdef RandomForestMetaData[float, float] *rf_forest = \
+            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
+
+        task_category = REGRESSION_MODEL
+        build_treelite_forest(& cuml_model_ptr,
+                              rf_forest,
+                              <int> n_cols,
+                              <int> task_category,
+                              <vector[unsigned char] &> self.model_pbuf_bytes)
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        treelite_handle = ctypes.c_void_p(mod_ptr).value
+
+        storage_type = \
+            _check_fil_parameter_validity(depth=self.max_depth,
+                                          fil_sparse_format=fil_sparse_format,
+                                          algo=algo)
+
+        fil_model = ForestInference()
+        tl_to_fil_model = \
+            fil_model.load_from_randomforest(treelite_handle,
+                                             output_class=False,
+                                             algo=algo,
+                                             storage_type=storage_type)
+
+        preds = tl_to_fil_model.predict(X, out_type)
+        tl.free_treelite_model(treelite_handle)
+        return preds
+    """
 
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
@@ -404,7 +512,7 @@ class RandomForestRegressor(BaseRandomForestModel):
                                 check_cols=self.n_cols)
         X_ptr = X_m.ptr
 
-        preds = CumlArray.zeros(n_rows, dtype=np.int32)
+        preds = CumlArray.zeros(n_rows, dtype=dtype)
         cdef uintptr_t preds_ptr = preds.ptr
 
         cdef cumlHandle* handle_ =\
@@ -417,23 +525,23 @@ class RandomForestRegressor(BaseRandomForestModel):
             <RandomForestMetaData[double, double]*><uintptr_t> self.rf_forest64
         if self.dtype == np.float32:
             predict(handle_[0],
-                           rf_forest,
-                           <float*> X_ptr,
-                           <int> n_rows,
-                           <int> n_cols,
-                           <float*> preds_ptr,
-                           <int> self.verbosity)
+                    rf_forest,
+                    <float*> X_ptr,
+                    <int> n_rows,
+                    <int> n_cols,
+                    <float*> preds_ptr,
+                    <int> self.verbosity)
 
         elif self.dtype == np.float64:
             predict(handle_[0],
-                            rf_forest64,
-                            <double*> X_ptr,
-                            <int> n_rows,
-                            <int> n_cols,
-                            <double*> preds_ptr,
-                            <int> self.verbosity)
+                    rf_forest64,
+                    <double*> X_ptr,
+                    <int> n_rows,
+                    <int> n_cols,
+                    <double*> preds_ptr,
+                    <int> self.verbosity)
         else:
-            raise TypeError("supports only np.float32 and np.float64 input,"
+            raise TypeError("supports only float32 and float64 input,"
                             " but input of type '%s' passed."
                             % (str(self.dtype)))
 
@@ -501,8 +609,7 @@ class RandomForestRegressor(BaseRandomForestModel):
                             setting predict_model = 'CPU'")
 
         else:
-            preds = self._predict_model_on_gpu(model=RandomForestRegressor,
-                                               X=X,
+            preds = self._predict_model_on_gpu(X=X,
                                                algo=algo,
                                                convert_dtype=convert_dtype,
                                                fil_sparse_format=fil_sparse_format)

From 89512dae37f00ce2c52a181b49e90477fbcac012 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 06:54:34 -0500
Subject: [PATCH 07/32] updated docs and code:

---
 .../dask/ensemble/randomforestclassifier.py   |   1 -
 python/cuml/ensemble/randomforest_common.pyx  | 431 ++++++++++++++++++
 .../cuml/ensemble/randomforestclassifier.pyx  | 292 +-----------
 .../cuml/ensemble/randomforestregressor.pyx   | 263 +----------
 4 files changed, 464 insertions(+), 523 deletions(-)
 create mode 100644 python/cuml/ensemble/randomforest_common.pyx

diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index d74592e484..27973f219e 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -298,7 +298,6 @@ def predict(self, X, output_class=True, algo='auto', threshold=0.5,
                 self.predict_using_fil(X, output_class=output_class,
                                        algo=algo,
                                        threshold=threshold,
-                                       num_classes=self.num_classes,
                                        convert_dtype=convert_dtype,
                                        predict_model="GPU",
                                        fil_sparse_format=fil_sparse_format,
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
new file mode 100644
index 0000000000..0801370f9e
--- /dev/null
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -0,0 +1,431 @@
+#
+# Copyright (c) 2020, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import cupy as cp
+import math
+import warnings
+
+import numpy as np
+from cuml import ForestInference
+from cuml.fil.fil import TreeliteModel
+from cuml.common.handle import Handle
+from cuml.common.base import Base
+from cuml.common.array import CumlArray
+
+from cython.operator cimport dereference as deref
+
+from cuml.ensemble.randomforest_shared cimport *
+from cuml.common import input_to_cuml_array, rmm_cupy_ary
+
+cimport cython
+
+# TODO: create a cdef class with a cdef function that calls the C++ function and returns the required handle
+class BaseRandomForestModel(Base):
+    variables = ['n_estimators', 'max_depth', 'handle',
+                 'max_features', 'n_bins',
+                 'split_algo', 'split_criterion', 'min_rows_per_node',
+                 'min_impurity_decrease',
+                 'bootstrap', 'bootstrap_features',
+                 'verbose', 'rows_sample',
+                 'max_leaves', 'quantile_per_tree']
+
+    def _create_model(self, model, seed, split_criterion,
+                      n_streams, n_estimators=100,
+                      max_depth=16, handle=None, max_features='auto',
+                      n_bins=8, split_algo=1, bootstrap=True,
+                      bootstrap_features=False,
+                      verbose=False, min_rows_per_node=2,
+                      rows_sample=1.0, max_leaves=-1,
+                      accuracy_metric=None, dtype=None,
+                      output_type=None, min_samples_leaf=None,
+                      min_weight_fraction_leaf=None, n_jobs=None,
+                      max_leaf_nodes=None, min_impurity_decrease=0.0,
+                      min_impurity_split=None, oob_score=None,
+                      random_state=None, warm_start=None, class_weight=None,
+                      quantile_per_tree=False, criterion=None):
+
+        if accuracy_metric:
+            model.variables.append('accuracy_metric')
+        sklearn_params = {"criterion": criterion,
+                          "min_samples_leaf": min_samples_leaf,
+                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
+                          "max_leaf_nodes": max_leaf_nodes,
+                          "min_impurity_split": min_impurity_split,
+                          "oob_score": oob_score, "n_jobs": n_jobs,
+                          "random_state": random_state,
+                          "warm_start": warm_start,
+                          "class_weight": class_weight}
+
+        for key, vals in sklearn_params.items():
+            if vals is not None:
+                raise TypeError(" The Scikit-learn variable ", key,
+                                " is not supported in cuML,"
+                                " please read the cuML documentation for"
+                                " more information")
+
+        if handle is None:
+            handle = Handle(n_streams)
+
+        super(model, self).__init__(handle=handle,
+                                    verbose=verbose,
+                                    output_type=output_type)
+        if max_depth < 0:
+            raise ValueError("Must specify max_depth >0 ")
+
+        self.split_algo = split_algo
+        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
+                          '3': MAE, '4': CRITERION_END}
+        if str(split_criterion) not in criterion_dict.keys():
+            warnings.warn("The split criterion chosen was not present"
+                          " in the list of options accepted by the model"
+                          " and so the CRITERION_END option has been chosen.")
+            self.split_criterion = CRITERION_END
+        else:
+            self.split_criterion = criterion_dict[str(split_criterion)]
+
+        self.min_rows_per_node = min_rows_per_node
+        self.min_impurity_decrease = min_impurity_decrease
+        self.bootstrap_features = bootstrap_features
+        self.rows_sample = rows_sample
+        self.max_leaves = max_leaves
+        self.n_estimators = n_estimators
+        self.max_depth = max_depth
+        self.max_features = max_features
+        self.bootstrap = bootstrap
+        self.verbose = verbose
+        self.n_bins = n_bins
+        self.n_cols = None
+        self.dtype = dtype
+        self.accuracy_metric = accuracy_metric
+        self.quantile_per_tree = quantile_per_tree
+        self.n_streams = handle.getNumInternalStreams()
+        self.seed = seed
+        self.rf_forest = 0
+        self.rf_forest64 = 0
+        self.model_pbuf_bytes = bytearray()
+        self.treelite_handle = None
+
+    def _get_max_feat_val(self):
+        if type(self.max_features) == int:
+            return self.max_features/self.n_cols
+        elif type(self.max_features) == float:
+            return self.max_features
+        elif self.max_features == 'sqrt':
+            return 1/np.sqrt(self.n_cols)
+        elif self.max_features == 'log2':
+            return math.log2(self.n_cols)/self.n_cols
+        elif self.max_features == 'auto':
+            if self.RF_type == CLASSIFICATION:
+                return 1/np.sqrt(self.n_cols)
+            else:
+                return 1.0
+        else:
+            raise ValueError("Wrong value passed in for max_features"
+                             " please read the documentation")
+    
+    def _get_protobuf_bytes(self):
+        """
+        Returns the self.model_pbuf_bytes.
+        Cuml RF model gets converted to treelite protobuf bytes by:
+            1. converting the cuml RF model to a treelite model. The treelite
+            model's handle (pointer) is returned.
+            2. The treelite model handle is used to convert the treelite model
+            to a treelite protobuf model which is stored in a temporary file.
+            The protobuf model information is read from the temporary file and
+            the byte information is returned.
+        The treelite handle is stored in `self.treelite_handle` and the
+        treelite protobuf model bytes are stored in `self.model_pbuf_bytes`.
+        If either piece of information is already present in the model, the
+        respective step is skipped.
+        """
+        if self.model_pbuf_bytes:
+            return self.model_pbuf_bytes
+        elif self.treelite_handle:
+            fit_mod_ptr = self.treelite_handle
+        else:
+            fit_mod_ptr = self._obtain_treelite_handle()
+        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        return self.model_pbuf_bytes
+
+
+    def _obtain_treelite_handle(self):
+        if self.treelite_handle:
+            print(" treelite handle in obt : ", self.treelite_handle)
+            return self.treelite_handle # Use cached version
+        cdef ModelHandle cuml_model_ptr = NULL
+        cdef unsigned char[::1] model_pbuf_mv
+        cdef vector[unsigned char] model_pbuf_vec
+        if self.model_pbuf_bytes:
+            model_pbuf_mv = self.model_pbuf_bytes
+            with cython.boundscheck(False):
+                model_pbuf_vec.assign(& model_pbuf_mv[0],
+                                      & model_pbuf_mv[model_pbuf_mv.shape[0]])
+        else:
+            model_pbuf_vec = <vector[unsigned char]&> bytearray()
+        if self.RF_type == CLASSIFICATION:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                model_pbuf_vec)
+        else:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> REGRESSION_MODEL,
+                model_pbuf_vec)
+
+        mod_ptr = <uintptr_t> cuml_model_ptr
+        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
+        return self.treelite_handle
+    
+
+    def _dataset_setup(self, X, y, convert_dtype):
+        self._set_output_type(X)
+
+        # Reset the old tree data for new fit call
+        self._reset_forest_data()
+
+        X_m, self.n_rows, self.n_cols, self.dtype = \
+            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
+                                order='F')
+        if self.n_bins > self.n_rows:
+            raise ValueError("The number of bins,`n_bins` can not be greater"
+                             " than the number of samples used for training.")
+        if self.RF_type == CLASSIFICATION:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y, check_dtype=np.int32,
+                                    convert_to_dtype=(np.int32 if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+            if y_dtype != np.int32:
+                raise TypeError("The labels `y` need to be of dtype `np.int32`")
+            unique_labels = rmm_cupy_ary(cp.unique, y_m)
+            self.num_classes = len(unique_labels)
+            for i in range(self.num_classes):
+                if i not in unique_labels:
+                    raise ValueError("The labels need "
+                                     "to be consecutive values from "
+                                     "0 to the number of unique label values")
+        else:
+            y_m, _, _, y_dtype = \
+                input_to_cuml_array(y,
+                                    convert_to_dtype=(self.dtype if convert_dtype
+                                                      else None),
+                                    check_rows=self.n_rows, check_cols=1)
+
+        if self.dtype == np.float64:
+            warnings.warn("To use GPU-based prediction, first train using \
+                          float 32 data to fit the estimator.")
+
+        max_feature_val = self._get_max_feat_val()
+        if type(self.min_rows_per_node) == float:
+            self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows)
+        del X
+        del y
+        return X_m, y_m, max_feature_val
+    
+
+    def _tl_model_handles(self, model_bytes):
+        cdef ModelHandle cuml_model_ptr = NULL
+        if self.RF_type == CLASSIFICATION:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> self.num_classes,
+                <vector[unsigned char] &> model_bytes)
+        else:
+            build_treelite_forest(
+                & cuml_model_ptr,
+                <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
+                <int> self.n_cols,
+                <int> REGRESSION_MODEL,
+                <vector[unsigned char] &> model_bytes)
+        mod_handle = <uintptr_t> cuml_model_ptr
+
+        return ctypes.c_void_p(mod_handle).value
+
+
+    def _concatenate_treelite_handle(self, treelite_handle):
+        """Merge the given treelite model handles into this estimator.
+
+        Resets the forest data, caches the concatenated handle and its
+        ``save_model`` bytes, syncs n_cols/n_estimators; returns self.
+        """
+        # Stack-allocated so the vector is released when the method returns.
+        cdef vector[ModelHandle] model_handles
+        cdef uintptr_t mod_ptr
+        for i in treelite_handle:
+            mod_ptr = <uintptr_t>i
+            model_handles.push_back(<ModelHandle> mod_ptr)
+
+        self._reset_forest_data()
+        cdef uintptr_t concat_model_ptr = \
+            <uintptr_t> concatenate_trees(model_handles)
+        self.treelite_handle = concat_model_ptr
+        cdef vector[unsigned char] pbuf_mod_info = \
+            save_model(<ModelHandle> concat_model_ptr)
+        cdef unsigned char[::1] pbuf_mod_view = \
+            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
+        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
+        # TreeliteModel is imported into this module under the alias ``tl``.
+        tl_model = tl.from_treelite_model_handle(
+            self.treelite_handle, take_handle_ownership=False)
+        self.n_cols = tl_model.num_features
+        self.n_estimators = tl_model.num_trees
+        return self
+
+    
+    def _predict_model_on_gpu(self, X, algo, convert_dtype,
+                              fil_sparse_format, threshold=0.5,
+                              output_class=False, predict_proba=False):
+        """Predict on ``X`` with a FIL model built from this forest.
+
+        ``X`` is checked against ``self.n_cols`` columns. float64 input is
+        rejected with a TypeError unless ``convert_dtype`` is True.
+        ``algo``, ``fil_sparse_format``, ``threshold`` and ``output_class``
+        are forwarded when the FIL model is loaded; ``predict_proba`` is
+        forwarded to its ``predict`` call, whose result is returned.
+        """
+        # numpy is not imported at module level in this file.
+        import numpy as np
+        out_type = self._get_output_type(X)
+        _, _, _, dtype = \
+            input_to_cuml_array(X, order='F', check_cols=self.n_cols)
+
+        if dtype == np.float64 and not convert_dtype:
+            # Adjacent literals keep source indentation out of the message.
+            raise TypeError("GPU based predict only accepts np.float32 data. "
+                            "Please set convert_dtype=True to convert the "
+                            "test data to the same dtype as the data used to "
+                            "train, ie. np.float32. If you would like to use "
+                            "test data of dtype=np.float64 please set "
+                            "predict_model='CPU' to use the CPU "
+                            "implementation of predict.")
+
+        # Shared helper: validates depth/algo/storage and loads the handle.
+        fil_model = _obtain_fil_model(self._obtain_treelite_handle(),
+                                      depth=self.max_depth,
+                                      output_class=output_class,
+                                      threshold=threshold, algo=algo,
+                                      fil_sparse_format=fil_sparse_format)
+        return fil_model.predict(X, output_type=out_type,
+                                 predict_proba=predict_proba)
+    
+    def _get_params(self, model, deep):
+        """Collect the estimator parameters named in ``model.variables``.
+
+        Skips ``handle``; unset attributes map to None. ``deep`` is unused.
+        """
+        return {name: getattr(self, name, None)
+                for name in model.variables
+                if name != 'handle'}
+
+    def _set_params(self, model, **params):
+        """Re-initialise the handle and model bytes, then apply ``params``.
+
+        Raises ValueError for a key that is not in ``model.variables``.
+        """
+        self.handle.__setstate__(self.n_streams)
+        self.model_pbuf_bytes = bytearray()  # same type as other resets
+        for key, value in params.items():
+            if key not in model.variables:
+                raise ValueError('Invalid parameter for estimator')
+            setattr(self, key, value)
+        return self
+    
+
+def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
+    """Validate FIL options; return the storage type or raise ValueError."""
+    storage_format = _check_fil_sparse_format_value(fil_sparse_format)
+    reorg_algo = algo in ('tree_reorg', 'batch_tree_reorg')
+    if depth > 16 and (storage_format == 'dense' or reorg_algo):
+        raise ValueError(
+            "While creating a forest with max_depth greater than 16, "
+            "`fil_sparse_format` should be True. If "
+            "`fil_sparse_format=False` then the memory consumed while "
+            "creating the FIL forest is very large and the process will be "
+            "aborted. In addition, `algo` must be either set to `naive` or "
+            "`auto` to set `fil_sparse_format=True`.")
+    return storage_format
+
+
+def _check_fil_sparse_format_value(fil_sparse_format):
+    """Map ``fil_sparse_format`` to the FIL storage type string.
+
+    'auto' -> 'auto', falsy -> 'dense', True -> 'sparse', else ValueError."""
+    if fil_sparse_format == 'auto':
+        return fil_sparse_format
+    if not fil_sparse_format:
+        return 'dense'
+    if fil_sparse_format not in (True, False, 'auto'):
+        raise ValueError("The value entered for fil_sparse_format is not "
+                         "supported. Please refer to the documentation "
+                         "to see the accepted values.")
+    return 'sparse'
+
+
+def _obtain_treelite_model(treelite_handle):
+    """
+    Creates a Treelite model using the treelite handle
+    obtained from the cuML Random Forest model.
+
+    Returns
+    ----------
+    treelite_model : Treelite version of this model
+    """
+    # TreeliteModel is imported into this module under the alias ``tl``.
+    treelite_model = tl.from_treelite_model_handle(treelite_handle)
+    return treelite_model
+
+
+def _obtain_fil_model(treelite_handle, depth,
+                      output_class=True,
+                      threshold=0.5, algo='auto',
+                      fil_sparse_format='auto'):
+    """
+    Builds a Forest Inference (FIL) model from a treelite handle.
+
+    ``depth``, ``algo`` and ``fil_sparse_format`` are validated first;
+    the resulting storage type is passed to FIL as ``storage_type``
+    together with ``output_class``, ``threshold`` and ``algo``.
+
+    Returns
+    ----------
+    fil_model :
+        The object returned by
+        ``ForestInference().load_using_treelite_handle``.
+    """
+    # Raises ValueError for an unsupported depth/algo/storage combination.
+    storage_kind = _check_fil_parameter_validity(
+        depth=depth, algo=algo, fil_sparse_format=fil_sparse_format)
+
+    load_kwargs = dict(output_class=output_class,
+                       threshold=threshold,
+                       algo=algo,
+                       storage_type=storage_kind)
+
+    loader = ForestInference()
+    return loader.load_using_treelite_handle(treelite_handle, **load_kwargs)
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index cc3a915edc..ed23fc4ce4 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -41,6 +41,8 @@ from cuml.common.handle import Handle
 from cuml.common.handle cimport cumlHandle
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
     _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
+from cuml.ensemble.randomforest_common import BaseRandomForestModel
+
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.fil.fil import TreeliteModel
 from cuml.common import input_to_cuml_array, rmm_cupy_ary
@@ -123,7 +125,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                           bool) except +
 
 
-class RandomForestClassifier(Base):
+class RandomForestClassifier(BaseRandomForestModel):
     """
     Implements a Random Forest classifier model which fits multiple decision
     tree classifiers in an ensemble.
@@ -231,81 +233,20 @@ class RandomForestClassifier(Base):
                  'rows_sample',
                  'max_leaves', 'quantile_per_tree']
 
-    def __init__(self, n_estimators=100, max_depth=16, handle=None,
-                 max_features='auto', n_bins=8, n_streams=8,
-                 split_algo=1, split_criterion=0, min_rows_per_node=2,
-                 bootstrap=True, bootstrap_features=False,
-                 type_model="classifier", verbose=False,
-                 rows_sample=1.0, max_leaves=-1, quantile_per_tree=False,
-                 output_type=None, criterion=None, dtype=None,
-                 min_samples_leaf=None, min_weight_fraction_leaf=None,
-                 max_leaf_nodes=None, min_impurity_decrease=0.0,
-                 min_impurity_split=None, oob_score=None, n_jobs=None,
-                 random_state=None, warm_start=None, class_weight=None,
-                 seed=None):
-        sklearn_params = {"criterion": criterion,
-                          "min_samples_leaf": min_samples_leaf,
-                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
-                          "max_leaf_nodes": max_leaf_nodes,
-                          "min_impurity_split": min_impurity_split,
-                          "oob_score": oob_score, "n_jobs": n_jobs,
-                          "random_state": random_state,
-                          "warm_start": warm_start,
-                          "class_weight": class_weight}
-
-        for key, vals in sklearn_params.items():
-            if vals is not None:
-                raise TypeError("The Scikit-learn variable", key,
-                                " is not supported in cuML,"
-                                " please read the cuML documentation for"
-                                " more information")
-
-        if max_depth < 0:
-            raise ValueError("Must specify max_depth >0")
-
-        if handle is None:
-            handle = Handle(n_streams)
-
-        super(RandomForestClassifier, self).__init__(handle=handle,
-                                                     verbose=verbose,
-                                                     output_type=output_type)
-
-        self.split_algo = split_algo
-        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                          '3': MAE, '4': CRITERION_END}
-        if str(split_criterion) not in criterion_dict.keys():
-            warnings.warn("The split criterion chosen was not present"
-                          " in the list of options accepted by the model"
-                          " and so the CRITERION_END option has been chosen.")
-            self.split_criterion = CRITERION_END
-        else:
-            self.split_criterion = criterion_dict[str(split_criterion)]
-
-        self.min_rows_per_node = min_rows_per_node
-        self.min_impurity_decrease = min_impurity_decrease
-        self.bootstrap_features = bootstrap_features
-        self.rows_sample = rows_sample
-        self.max_leaves = max_leaves
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.max_features = max_features
-        self.bootstrap = bootstrap
-        self.treelite_handle = None
-        self.n_bins = n_bins
-        self.quantile_per_tree = quantile_per_tree
-        self.n_cols = None
-        self.dtype = None
-        self.n_streams = handle.getNumInternalStreams()
-        self.seed = seed
-        self.num_classes = 2
+    def __init__(self, split_criterion=0, seed=None,
+                 n_streams=8, **kwargs):
         if ((seed is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results, n_streams==1 is "
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "
                           "random_seed is set")
-        self.rf_forest = 0
-        self.rf_forest64 = 0
-        self.model_pbuf_bytes = bytearray()
+
+        self.RF_type = CLASSIFICATION
+        self.num_classes = 2
+        self._create_model(model=RandomForestClassifier,
+                      split_criterion=split_criterion,
+                      seed=seed, n_streams=n_streams,
+                      **kwargs)
 
     """
     TODO:
@@ -385,86 +326,6 @@ class RandomForestClassifier(Base):
         self.model_pbuf_bytes = bytearray()
         self.n_cols = None
 
-    def _get_max_feat_val(self):
-        if type(self.max_features) == int:
-            return self.max_features/self.n_cols
-        elif type(self.max_features) == float:
-            return self.max_features
-        elif self.max_features == 'sqrt' or self.max_features == 'auto':
-            return 1/np.sqrt(self.n_cols)
-        elif self.max_features == 'log2':
-            return math.log2(self.n_cols)/self.n_cols
-        else:
-            raise ValueError("Wrong value passed in for max_features"
-                             " please read the documentation")
-
-    def _obtain_treelite_handle(self):
-        """Returns a handle to a treelite-formatted version of the model.
-        This will create a new treelite model if necessary, or return
-        a cached version when available. The handle is cached in the
-        instanced and freed at instance deletion. Caller should not
-        delete the returned model."""
-        if self.treelite_handle is not None:
-            return self.treelite_handle  # Cached version
-
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-
-        assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \
-            "Attempting to create treelite from un-fit forest."
-
-        if self.num_classes > 2:
-            raise NotImplementedError("Pickling for multi-class "
-                                      "classification models is currently not "
-                                      "implemented. Please check cuml issue "
-                                      "#1679 for more information.")
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-
-        task_category = CLASSIFICATION_MODEL
-        build_treelite_forest(
-            & cuml_model_ptr,
-            rf_forest,
-            <int> self.n_cols,
-            <int> task_category,
-            model_pbuf_vec)
-        mod_ptr = <uintptr_t> cuml_model_ptr
-        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return self.treelite_handle
-
-    def _get_protobuf_bytes(self):
-        """
-        Returns the self.model_pbuf_bytes.
-        Cuml RF model gets converted to treelite protobuf bytes by:
-            1. converting the cuml RF model to a treelite model. The treelite
-            models handle (pointer) is returned
-            2. The treelite model handle is used to convert the treelite model
-            to a treelite protobuf model which is stored in a temporary file.
-            The protobuf model information is read from the temporary file and
-            the byte information is returned.
-        The treelite handle is stored `self.treelite_handle` and the treelite
-        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
-        of information is already present in the model then the respective
-        step is skipped.
-        """
-        if self.model_pbuf_bytes:
-            return self.model_pbuf_bytes
-        elif self.treelite_handle:
-            fit_mod_ptr = self.treelite_handle
-        else:
-            fit_mod_ptr = self._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self.model_pbuf_bytes
-
     def convert_to_treelite_model(self):
         """
         Converts the cuML RF model to a Treelite model
@@ -534,49 +395,6 @@ class RandomForestClassifier(Base):
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
     """
-    def _tl_model_handles(self, model_bytes):
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, int] *rf_forest = \
-            <RandomForestMetaData[float, int]*><uintptr_t> self.rf_forest
-        task_category = CLASSIFICATION_MODEL
-        build_treelite_forest(& cuml_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> model_bytes)
-        mod_handle = <uintptr_t> cuml_model_ptr
-
-        return ctypes.c_void_p(mod_handle).value
-
-    def _concatenate_treelite_handle(self, treelite_handle):
-        cdef ModelHandle concat_model_handle = NULL
-        cdef vector[ModelHandle] *model_handles \
-            = new vector[ModelHandle]()
-        cdef uintptr_t mod_ptr
-        for i in treelite_handle:
-            mod_ptr = <uintptr_t>i
-            model_handles.push_back((
-                <ModelHandle> mod_ptr))
-
-        self._reset_forest_data()
-        concat_model_handle = concatenate_trees(deref(model_handles))
-        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
-        self.treelite_handle = concat_model_ptr
-
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> concat_model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-
-        # Fix up some instance variables that should match the new TL model
-        tl_model = TreeliteModel.from_treelite_model_handle(
-            self.treelite_handle,
-            take_handle_ownership=False)
-        self.n_cols = tl_model.num_features
-        self.n_estimators = tl_model.num_trees
-
-        return self
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -599,50 +417,16 @@ class RandomForestClassifier(Base):
             memory used for the method.
 
         """
-        self._set_output_type(X)
-
-        # Reset the old tree data for new fit call
-        self._reset_forest_data()
-
+        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
         cdef uintptr_t X_ptr, y_ptr
-
-        X_m, n_rows, self.n_cols, self.dtype = \
-            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
-                                order='F')
-        if self.n_bins > n_rows:
-            raise ValueError("The number of bins,`n_bins` can not be greater"
-                             " than the number of samples used for training.")
         X_ptr = X_m.ptr
-
-        y_m, _, _, y_dtype = \
-            input_to_cuml_array(y, check_dtype=np.int32,
-                                convert_to_dtype=(np.int32 if convert_dtype
-                                                  else None),
-                                check_rows=n_rows, check_cols=1)
         y_ptr = y_m.ptr
-        if y_dtype != np.int32:
-            raise TypeError("The labels `y` need to be of dtype `np.int32`")
-
-        if self.dtype == np.float64:
-            warnings.warn("To use GPU-based prediction, first train \
-                          using float 32 data to fit the estimator.")
-
         cdef cumlHandle* handle_ =\
             <cumlHandle*><uintptr_t>self.handle.getHandle()
 
         unique_labels = rmm_cupy_ary(cp.unique, y_m)
         num_unique_labels = len(unique_labels)
 
-        for i in range(num_unique_labels):
-            if i not in unique_labels:
-                raise ValueError("The labels need "
-                                 "to be consecutive values from "
-                                 "0 to the number of unique label values")
-
-        max_feature_val = self._get_max_feat_val()
-        if type(self.min_rows_per_node) == float:
-            self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows)
-
         cdef RandomForestMetaData[float, int] *rf_forest = \
             new RandomForestMetaData[float, int]()
         self.rf_forest = <uintptr_t> rf_forest
@@ -675,7 +459,7 @@ class RandomForestClassifier(Base):
             fit(handle_[0],
                 rf_forest,
                 <float*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <int*> y_ptr,
                 <int> num_unique_labels,
@@ -687,7 +471,7 @@ class RandomForestClassifier(Base):
             fit(handle_[0],
                 rf_forest64,
                 <double*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <int*> y_ptr,
                 <int> num_unique_labels,
@@ -706,43 +490,6 @@ class RandomForestClassifier(Base):
         self.num_classes = num_unique_labels
         return self
 
-    def _predict_model_on_gpu(self, X, output_class,
-                              threshold, algo,
-                              num_classes, convert_dtype,
-                              fil_sparse_format, predict_proba):
-        out_type = self._get_output_type(X)
-        cdef ModelHandle cuml_model_ptr = NULL
-        _, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='F',
-                                check_cols=self.n_cols)
-
-        if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            Please set convert_dtype=True to convert the test \
-                            data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
-                            predict_model='CPU' to use the CPU implementation \
-                            of predict.")
-
-        treelite_handle = self._obtain_treelite_handle()
-
-        storage_type = \
-            _check_fil_parameter_validity(depth=self.max_depth,
-                                          fil_sparse_format=fil_sparse_format,
-                                          algo=algo)
-        fil_model = ForestInference()
-        tl_to_fil_model = \
-            fil_model.load_using_treelite_handle(treelite_handle,
-                                                 output_class=output_class,
-                                                 threshold=threshold,
-                                                 algo=algo,
-                                                 storage_type=storage_type)
-
-        preds = tl_to_fil_model.predict(X, output_type=out_type,
-                                        predict_proba=predict_proba)
-        return preds
-
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr
@@ -794,7 +541,7 @@ class RandomForestClassifier(Base):
     def predict(self, X, predict_model="GPU",
                 output_class=True, threshold=0.5,
                 algo='auto',
-                num_classes=2, convert_dtype=True,
+                convert_dtype=True,
                 fil_sparse_format='auto'):
         """
         Predicts the labels for X.
@@ -832,7 +579,8 @@ class RandomForestClassifier(Base):
             while performing the predict operation on the GPU.
             It is applied if output_class == True, else it is ignored
         num_classes : int (default = 2)
-            number of different classes present in the dataset
+            number of different classes present in the dataset. This variable
+            will be deprecated in 0.16
         convert_dtype : bool, optional (default = True)
             When set to True, the predict method will, when necessary, convert
             the input to the data type which was used to train the model. This
@@ -872,7 +620,6 @@ class RandomForestClassifier(Base):
                 self._predict_model_on_gpu(X, output_class=output_class,
                                            threshold=threshold,
                                            algo=algo,
-                                           num_classes=num_classes,
                                            convert_dtype=convert_dtype,
                                            fil_sparse_format=fil_sparse_format,
                                            predict_proba=False)
@@ -941,7 +688,7 @@ class RandomForestClassifier(Base):
 
     def predict_proba(self, X, output_class=True,
                       threshold=0.5, algo='auto',
-                      num_classes=2, convert_dtype=True,
+                      convert_dtype=True,
                       fil_sparse_format='auto'):
         """
         Predicts class probabilites for X. This function uses the GPU
@@ -1018,7 +765,6 @@ class RandomForestClassifier(Base):
             self._predict_model_on_gpu(X, output_class=output_class,
                                        threshold=threshold,
                                        algo=algo,
-                                       num_classes=num_classes,
                                        convert_dtype=convert_dtype,
                                        fil_sparse_format=fil_sparse_format,
                                        predict_proba=True)
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index dc0a4f85f8..f684f69ccc 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -37,6 +37,7 @@ from cuml.common.handle import Handle
 from cuml.common.handle cimport cumlHandle
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
     _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
+from cuml.ensemble.randomforest_common import BaseRandomForestModel
 
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.fil.fil import TreeliteModel
@@ -103,7 +104,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                           int) except +
 
 
-class RandomForestRegressor(Base):
+class RandomForestRegressor(BaseRandomForestModel):
 
     """
     Implements a Random Forest regressor model which fits multiple decision
@@ -218,81 +219,15 @@ class RandomForestRegressor(Base):
                  'max_leaves', 'quantile_per_tree',
                  'accuracy_metric']
 
-    def __init__(self, n_estimators=100, max_depth=16, handle=None,
-                 max_features='auto', n_bins=8, n_streams=8,
-                 split_algo=1, split_criterion=2,
-                 bootstrap=True, bootstrap_features=False,
-                 verbose=False, min_rows_per_node=2,
-                 rows_sample=1.0, max_leaves=-1,
-                 accuracy_metric='mse', output_type=None,
-                 min_samples_leaf=None, dtype=None,
-                 min_weight_fraction_leaf=None, n_jobs=None,
-                 max_leaf_nodes=None, min_impurity_decrease=0.0,
-                 min_impurity_split=None, oob_score=None,
-                 random_state=None, warm_start=None, class_weight=None,
-                 quantile_per_tree=False, criterion=None, seed=None):
-        sklearn_params = {"criterion": criterion,
-                          "min_samples_leaf": min_samples_leaf,
-                          "min_weight_fraction_leaf": min_weight_fraction_leaf,
-                          "max_leaf_nodes": max_leaf_nodes,
-                          "min_impurity_split": min_impurity_split,
-                          "oob_score": oob_score, "n_jobs": n_jobs,
-                          "random_state": random_state,
-                          "warm_start": warm_start,
-                          "class_weight": class_weight}
-
-        for key, vals in sklearn_params.items():
-            if vals is not None:
-                raise TypeError(" The Scikit-learn variable ", key,
-                                " is not supported in cuML,"
-                                " please read the cuML documentation for"
-                                " more information")
-
-        if handle is None:
-            handle = Handle(n_streams)
-
-        super(RandomForestRegressor, self).__init__(handle=handle,
-                                                    verbose=verbose,
-                                                    output_type=output_type)
-
-        if max_depth < 0:
-            raise ValueError("Must specify max_depth >0 ")
-
-        self.split_algo = split_algo
-        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                          '3': MAE, '4': CRITERION_END}
-        if str(split_criterion) not in criterion_dict.keys():
-            warnings.warn("The split criterion chosen was not present"
-                          " in the list of options accepted by the model"
-                          " and so the CRITERION_END option has been chosen.")
-            self.split_criterion = CRITERION_END
-        else:
-            self.split_criterion = criterion_dict[str(split_criterion)]
-
-        self.min_rows_per_node = min_rows_per_node
-        self.min_impurity_decrease = min_impurity_decrease
-        self.bootstrap_features = bootstrap_features
-        self.rows_sample = rows_sample
-        self.max_leaves = max_leaves
-        self.n_estimators = n_estimators
-        self.max_depth = max_depth
-        self.max_features = max_features
-        self.bootstrap = bootstrap
-        self.n_bins = n_bins
-        self.n_cols = None
-        self.dtype = None
-        self.treelite_handle = None
-        self.accuracy_metric = accuracy_metric
-        self.quantile_per_tree = quantile_per_tree
-        self.n_streams = handle.getNumInternalStreams()
-        self.seed = seed
-        if ((seed is not None) and (n_streams != 1)):
-            warnings.warn("Setting the random seed does not fully guarantee"
-                          " the exact same results at this time.")
-        self.rf_forest = None
-        self.rf_forest64 = None
-        self.model_pbuf_bytes = bytearray()
-
+    def __init__(self, split_criterion=2, seed=None,
+                 accuracy_metric='mse', n_streams=8,
+                 **kwargs):
+        self.RF_type = REGRESSION
+        self._create_model(model=RandomForestRegressor,
+                      split_criterion=split_criterion,
+                      seed=seed, n_streams=n_streams,
+                      accuracy_metric=accuracy_metric,
+                      **kwargs)
     """
     TODO:
         Add the preprocess and postprocess functions
@@ -369,82 +304,6 @@ class RandomForestRegressor(Base):
         self.model_pbuf_bytes = bytearray()
         self.n_cols = None
 
-    def _get_max_feat_val(self):
-        if type(self.max_features) == int:
-            return self.max_features/self.n_cols
-        elif type(self.max_features) == float:
-            return self.max_features
-        elif self.max_features == 'sqrt':
-            return 1/np.sqrt(self.n_cols)
-        elif self.max_features == 'auto':
-            return 1.0
-        elif self.max_features == 'log2':
-            return math.log2(self.n_cols)/self.n_cols
-        else:
-            raise ValueError("Wrong value passed in for max_features"
-                             " please read the documentation")
-
-    def _obtain_treelite_handle(self):
-        """Returns a handle to a treelite-formatted version of the model.
-        This will create a new treelite model if necessary, or return
-        a cached version when available. The handle is cached in the
-        instanced and freed at instance deletion. Caller should not
-        delete the returned model."""
-        if self.treelite_handle is not None:
-            return self.treelite_handle  # Cached version
-
-        cdef ModelHandle cuml_model_ptr = NULL
-        cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
-        assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \
-            "Attempting to create treelite from un-fit forest."
-
-        cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes
-        cdef vector[unsigned char] model_pbuf_vec
-        with cython.boundscheck(False):
-            model_pbuf_vec.assign(& model_pbuf_mv[0],
-                                  & model_pbuf_mv[model_pbuf_mv.shape[0]])
-
-        task_category = REGRESSION_MODEL
-        build_treelite_forest(
-            & cuml_model_ptr,
-            rf_forest,
-            <int> self.n_cols,
-            <int> task_category,
-            model_pbuf_vec)
-        mod_ptr = <uintptr_t> cuml_model_ptr
-        self.treelite_handle = ctypes.c_void_p(mod_ptr).value
-        return self.treelite_handle
-
-    def _get_protobuf_bytes(self):
-        """
-        Returns the self.model_pbuf_bytes.
-        Cuml RF model gets converted to treelite protobuf bytes by:
-            1. converting the cuml RF model to a treelite model. The treelite
-            models handle (pointer) is returned
-            2. The treelite model handle is used to convert the treelite model
-            to a treelite protobuf model which is stored in a temporary file.
-            The protobuf model information is read from the temporary file and
-            the byte information is returned.
-        The treelite handle is stored `self.treelite_handle` and the treelite
-        protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
-        of information is already present in the model then the respective
-        step is skipped.
-        """
-        if self.model_pbuf_bytes:
-            return self.model_pbuf_bytes
-        elif self.treelite_handle:
-            fit_mod_ptr = self.treelite_handle
-        else:
-            fit_mod_ptr = self._obtain_treelite_handle()
-        cdef uintptr_t model_ptr = <uintptr_t> fit_mod_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-        return self.model_pbuf_bytes
-
     def convert_to_treelite_model(self):
         """
         Converts the cuML RF model to a Treelite model
@@ -511,48 +370,6 @@ class RandomForestRegressor(Base):
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
     """
-    def _tl_model_handles(self, model_bytes):
-        task_category = REGRESSION_MODEL
-        cdef ModelHandle tl_model_ptr = NULL
-        cdef RandomForestMetaData[float, float] *rf_forest = \
-            <RandomForestMetaData[float, float]*><uintptr_t> self.rf_forest
-        build_treelite_forest(& tl_model_ptr,
-                              rf_forest,
-                              <int> self.n_cols,
-                              <int> task_category,
-                              <vector[unsigned char] &> model_bytes)
-        mod_handle = <uintptr_t> tl_model_ptr
-
-        return ctypes.c_void_p(mod_handle).value
-
-    def _concatenate_treelite_handle(self, treelite_handle):
-        cdef ModelHandle concat_model_handle = NULL
-        cdef vector[ModelHandle] *model_handles \
-            = new vector[ModelHandle]()
-        cdef uintptr_t mod_ptr
-        for i in treelite_handle:
-            mod_ptr = <uintptr_t>i
-            model_handles.push_back((
-                <ModelHandle> mod_ptr))
-
-        self._reset_forest_data()
-        concat_model_handle = concatenate_trees(deref(model_handles))
-        cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
-        self.treelite_handle = concat_model_ptr
-        cdef vector[unsigned char] pbuf_mod_info = \
-            save_model(<ModelHandle> concat_model_ptr)
-        cdef unsigned char[::1] pbuf_mod_view = \
-            <unsigned char[:pbuf_mod_info.size():1]>pbuf_mod_info.data()
-        self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
-
-        # Fix up some instance variables that should match the new TL model
-        tl_model = TreeliteModel.from_treelite_model_handle(
-            self.treelite_handle,
-            take_handle_ownership=False)
-        self.n_cols = tl_model.num_features
-        self.n_estimators = tl_model.num_trees
-
-        return self
 
     def fit(self, X, y, convert_dtype=False):
         """
@@ -570,25 +387,11 @@ class RandomForestRegressor(Base):
             ndarray, cuda array interface compliant array like CuPy
             These labels should be contiguous integers from 0 to n_classes.
         """
-        self._set_output_type(X)
+        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
 
         # Reset the old tree data for new fit call
-        self._reset_forest_data()
-
         cdef uintptr_t X_ptr, y_ptr
-
-        X_m, n_rows, self.n_cols, self.dtype = \
-            input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
-                                order='F')
-        if self.n_bins > n_rows:
-            raise ValueError("The number of bins,`n_bins` can not be greater"
-                             " than the number of samples used for training.")
         X_ptr = X_m.ptr
-        y_m, _, _, y_dtype = \
-            input_to_cuml_array(y,
-                                convert_to_dtype=(self.dtype if convert_dtype
-                                                  else None),
-                                check_rows=n_rows, check_cols=1)
         y_ptr = y_m.ptr
 
         if self.dtype == np.float64:
@@ -598,10 +401,6 @@ class RandomForestRegressor(Base):
         cdef cumlHandle* handle_ =\
             <cumlHandle*><uintptr_t>self.handle.getHandle()
 
-        max_feature_val = self._get_max_feat_val()
-        if type(self.min_rows_per_node) == float:
-            self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows)
-
         cdef RandomForestMetaData[float, float] *rf_forest = \
             new RandomForestMetaData[float, float]()
         self.rf_forest = <uintptr_t> rf_forest
@@ -634,7 +433,7 @@ class RandomForestRegressor(Base):
             fit(handle_[0],
                 rf_forest,
                 <float*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <float*> y_ptr,
                 rf_params,
@@ -645,7 +444,7 @@ class RandomForestRegressor(Base):
             fit(handle_[0],
                 rf_forest64,
                 <double*> X_ptr,
-                <int> n_rows,
+                <int> self.n_rows,
                 <int> self.n_cols,
                 <double*> y_ptr,
                 rf_params64,
@@ -657,40 +456,6 @@ class RandomForestRegressor(Base):
         del(y_m)
         return self
 
-    def _predict_model_on_gpu(self, X, algo, convert_dtype,
-                              fil_sparse_format):
-        out_type = self._get_output_type(X)
-        cdef ModelHandle cuml_model_ptr = NULL
-        _, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='F',
-                                check_cols=self.n_cols)
-
-        if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
-                            Please set convert_dtype=True to convert the test \
-                            data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
-                            predict_model='CPU' to use the CPU implementation \
-                            of predict.")
-
-        treelite_handle = self._obtain_treelite_handle()
-
-        storage_type = \
-            _check_fil_parameter_validity(depth=self.max_depth,
-                                          fil_sparse_format=fil_sparse_format,
-                                          algo=algo)
-
-        fil_model = ForestInference()
-        tl_to_fil_model = \
-            fil_model.load_using_treelite_handle(treelite_handle,
-                                                 output_class=False,
-                                                 algo=algo,
-                                                 storage_type=storage_type)
-
-        preds = tl_to_fil_model.predict(X, out_type)
-        return preds
-
     def _predict_model_on_cpu(self, X, convert_dtype):
         out_type = self._get_output_type(X)
         cdef uintptr_t X_ptr

From cff16d74ab566548d4695615ad7b0c13fd722104 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 07:31:56 -0500
Subject: [PATCH 08/32] style check

---
 cpp/include/cuml/ensemble/randomforest.hpp | 2 +-
 cpp/src/randomforest/randomforest.cu       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp
index defe869347..84c951dd0c 100644
--- a/cpp/include/cuml/ensemble/randomforest.hpp
+++ b/cpp/include/cuml/ensemble/randomforest.hpp
@@ -217,4 +217,4 @@ RF_metrics score(const cumlHandle& user_handle,
                  const RandomForestRegressorD* forest, const double* ref_labels,
                  int n_rows, const double* predictions,
                  int verbosity = CUML_LEVEL_INFO);
-};  // namespace ML
\ No newline at end of file
+};  // namespace ML
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 625b1b37d4..1a17bbe0f8 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -795,4 +795,4 @@ template void build_treelite_forest<float, float>(
 template void build_treelite_forest<double, double>(
   ModelHandle* model, const RandomForestMetaData<double, double>* forest,
   int num_features, int task_category, std::vector<unsigned char>& data);
-}  // End namespace ML
\ No newline at end of file
+}  // End namespace ML

From 0fc2df19e9eff63fc44c7134760338e78a277fe8 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 09:02:35 -0500
Subject: [PATCH 09/32] fix style errors

---
 python/cuml/ensemble/randomforest_common.pyx  | 39 ++++++++++---------
 .../cuml/ensemble/randomforestclassifier.pyx  |  6 +--
 .../cuml/ensemble/randomforestregressor.pyx   | 18 ++++-----
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index e91f1262cb..9275a4ff2e 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -33,7 +33,7 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 cimport cython
 
-# create a cdef class and cdef func which will call the C++ cdef func and then return the required handle and stuff
+
 class BaseRandomForestModel(Base):
     variables = ['n_estimators', 'max_depth', 'handle',
                  'max_features', 'n_bins',
@@ -166,11 +166,9 @@ class BaseRandomForestModel(Base):
         self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view))
         return self.model_pbuf_bytes
 
-
     def _obtain_treelite_handle(self):
         if self.treelite_handle:
-            print(" treelite handle in obt : ", self.treelite_handle)
-            return self.treelite_handle # Use cached version
+            return self.treelite_handle  # Use cached version
         cdef ModelHandle cuml_model_ptr = NULL
         cdef unsigned char[::1] model_pbuf_mv
         cdef vector[unsigned char] model_pbuf_vec
@@ -180,7 +178,7 @@ class BaseRandomForestModel(Base):
                 model_pbuf_vec.assign(& model_pbuf_mv[0],
                                       & model_pbuf_mv[model_pbuf_mv.shape[0]])
         else:
-            model_pbuf_vec = <vector[unsigned char]&> bytearray()
+            model_pbuf_vec = <vector[unsigned char] &> bytearray()
         if self.RF_type == CLASSIFICATION:
             build_treelite_forest(
                 & cuml_model_ptr,
@@ -214,12 +212,14 @@ class BaseRandomForestModel(Base):
                              " than the number of samples used for training.")
         if self.RF_type == CLASSIFICATION:
             y_m, _, _, y_dtype = \
-                input_to_cuml_array(y, check_dtype=np.int32,
-                                    convert_to_dtype=(np.int32 if convert_dtype
-                                                      else None),
-                                    check_rows=self.n_rows, check_cols=1)
+                input_to_cuml_array(
+                    y, check_dtype=np.int32,
+                    convert_to_dtype=(np.int32 if convert_dtype
+                                      else None),
+                    check_rows=self.n_rows, check_cols=1)
             if y_dtype != np.int32:
-                raise TypeError("The labels `y` need to be of dtype `np.int32`")
+                raise TypeError("The labels `y` need to be of dtype"
+                                " `np.int32`")
             unique_labels = rmm_cupy_ary(cp.unique, y_m)
             self.num_classes = len(unique_labels)
             for i in range(self.num_classes):
@@ -229,10 +229,11 @@ class BaseRandomForestModel(Base):
                                      "0 to the number of unique label values")
         else:
             y_m, _, _, y_dtype = \
-                input_to_cuml_array(y,
-                                    convert_to_dtype=(self.dtype if convert_dtype
-                                                      else None),
-                                    check_rows=self.n_rows, check_cols=1)
+                input_to_cuml_array(
+                    y,
+                    convert_to_dtype=(self.dtype if convert_dtype
+                                      else None),
+                    check_rows=self.n_rows, check_cols=1)
 
         if self.dtype == np.float64:
             warnings.warn("To use GPU-based prediction, first train using \
@@ -240,11 +241,12 @@ class BaseRandomForestModel(Base):
 
         max_feature_val = self._get_max_feat_val()
         if type(self.min_rows_per_node) == float:
-            self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows)
+            self.min_rows_per_node = \
+                math.ceil(self.min_rows_per_node*self.n_rows)
         del X
         del y
         return X_m, y_m, max_feature_val
-    
+
     def _tl_model_handles(self, model_bytes):
         cdef ModelHandle cuml_model_ptr = NULL
         if self.RF_type == CLASSIFICATION:
@@ -265,7 +267,6 @@ class BaseRandomForestModel(Base):
 
         return ctypes.c_void_p(mod_handle).value
 
-
     def _concatenate_treelite_handle(self, treelite_handle):
         cdef ModelHandle concat_model_handle = NULL
         cdef vector[ModelHandle] *model_handles \
@@ -293,7 +294,6 @@ class BaseRandomForestModel(Base):
         self.n_estimators = tl_model.num_trees
         return self
 
-    
     def _predict_model_on_gpu(self, X, algo, convert_dtype,
                               fil_sparse_format, threshold=0.5,
                               output_class=False, predict_proba=False):
@@ -329,7 +329,7 @@ class BaseRandomForestModel(Base):
         preds = tl_to_fil_model.predict(X, output_type=out_type,
                                         predict_proba=predict_proba)
         return preds
-    
+
     def _get_params(self, model, deep):
         params = dict()
         for key in model.variables:
@@ -352,6 +352,7 @@ class BaseRandomForestModel(Base):
                 setattr(self, key, value)
         return self
 
+
 def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
     storage_format = _check_fil_sparse_format_value(fil_sparse_format)
     if (depth > 16 and (storage_format == 'dense' or
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 3dcd7c87b6..205fb5b0c4 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -880,8 +880,6 @@ class RandomForestClassifier(BaseRandomForestModel):
         -----------
         deep : boolean (default = True)
         """
-
-
         return self._get_params(model=RandomForestClassifier,
                                 deep=deep)
 
@@ -895,9 +893,7 @@ class RandomForestClassifier(BaseRandomForestModel):
         -----------
         params : dict of new params
         """
-        # Resetting handle as __setstate__ overwrites with handle=None
-
-
+        # Resetting handle as __setstate__ overwrites with handle=Non
         return self._set_params(model=RandomForestClassifier,
                                 **params)
 
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 5089e8afb9..2f41898113 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -217,10 +217,10 @@ class RandomForestRegressor(BaseRandomForestModel):
                  **kwargs):
         self.RF_type = REGRESSION
         self._create_model(model=RandomForestRegressor,
-                      split_criterion=split_criterion,
-                      seed=seed, n_streams=n_streams,
-                      accuracy_metric=accuracy_metric,
-                      **kwargs)
+                           split_criterion=split_criterion,
+                           seed=seed, n_streams=n_streams,
+                           accuracy_metric=accuracy_metric,
+                           **kwargs)
     """
     TODO:
         Add the preprocess and postprocess functions
@@ -356,7 +356,6 @@ class RandomForestRegressor(BaseRandomForestModel):
                                  algo=algo,
                                  fil_sparse_format=fil_sparse_format)
 
-
     """
     TODO : Move functions duplicated in the RF classifier and regressor
            to a shared file. Cuml issue #1854 has been created to track this.
@@ -549,10 +548,11 @@ class RandomForestRegressor(BaseRandomForestModel):
                             setting predict_model = 'CPU'")
 
         else:
-            preds = self._predict_model_on_gpu(X=X,
-                                               algo=algo,
-                                               convert_dtype=convert_dtype,
-                                               fil_sparse_format=fil_sparse_format)
+            preds = self._predict_model_on_gpu(
+                X=X,
+                algo=algo,
+                convert_dtype=convert_dtype,
+                fil_sparse_format=fil_sparse_format)
 
         return preds
 

From 0c02d92ba9060795c0d698c3b89a274d285a1894 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 09:14:14 -0500
Subject: [PATCH 10/32] update CHANGELOG.md

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b4d2bd61b2..b3612a036a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,7 @@
 
 - PR #2310: Pinning ucx-py to 0.14 to make 0.15 CI pass
 - PR #1945: enable clang tidy
+- PR #2237: Refactor RF cython code
 
 ## Bug Fixes
 

From 2dc70fd0b7db82e4141d23e5a98c1568124c8698 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 16:11:26 -0500
Subject: [PATCH 11/32] updated code

---
 python/cuml/ensemble/randomforest_common.pyx  | 30 ++++---
 .../cuml/ensemble/randomforestclassifier.pyx  | 85 ++++++++++---------
 .../cuml/ensemble/randomforestregressor.pyx   | 52 ++++++------
 3 files changed, 89 insertions(+), 78 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 9275a4ff2e..23d99ad492 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -43,7 +43,7 @@ class BaseRandomForestModel(Base):
                  'verbose', 'rows_sample',
                  'max_leaves', 'quantile_per_tree']
 
-    def _create_model(self, model, seed, split_criterion,
+    def _create_model(self, seed, split_criterion,
                       n_streams, n_estimators=100,
                       max_depth=16, handle=None, max_features='auto',
                       n_bins=8, split_algo=1, bootstrap=True,
@@ -59,7 +59,8 @@ class BaseRandomForestModel(Base):
                       quantile_per_tree=False, criterion=None):
 
         if accuracy_metric:
-            model.variables.append('accuracy_metric')
+            BaseRandomForestModel.variables.append('accuracy_metric')
+
         sklearn_params = {"criterion": criterion,
                           "min_samples_leaf": min_samples_leaf,
                           "min_weight_fraction_leaf": min_weight_fraction_leaf,
@@ -71,7 +72,7 @@ class BaseRandomForestModel(Base):
                           "class_weight": class_weight}
 
         for key, vals in sklearn_params.items():
-            if vals is not None:
+            if vals:
                 raise TypeError(" The Scikit-learn variable ", key,
                                 " is not supported in cuML,"
                                 " please read the cuML documentation for"
@@ -80,9 +81,11 @@ class BaseRandomForestModel(Base):
         if handle is None:
             handle = Handle(n_streams)
 
-        super(model, self).__init__(handle=handle,
-                                    verbose=verbose,
-                                    output_type=output_type)
+        super(BaseRandomForestModel, self).__init__(
+            handle=handle,
+            verbose=verbose,
+            output_type=output_type)
+
         if max_depth < 0:
             raise ValueError("Must specify max_depth >0 ")
 
@@ -152,6 +155,9 @@ class BaseRandomForestModel(Base):
         of information is already present in the model then the respective
         step is skipped.
         """
+        if self.dtype == np.float64:
+            raise TypeError("To use pickling, first train the model"
+                            " using float 32 data.")
         if self.model_pbuf_bytes:
             return self.model_pbuf_bytes
         elif self.treelite_handle:
@@ -179,6 +185,7 @@ class BaseRandomForestModel(Base):
                                       & model_pbuf_mv[model_pbuf_mv.shape[0]])
         else:
             model_pbuf_vec = <vector[unsigned char] &> bytearray()
+
         if self.RF_type == CLASSIFICATION:
             build_treelite_forest(
                 & cuml_model_ptr,
@@ -198,7 +205,7 @@ class BaseRandomForestModel(Base):
         self.treelite_handle = ctypes.c_void_p(mod_ptr).value
         return self.treelite_handle
 
-    def _dataset_setup(self, X, y, convert_dtype):
+    def _dataset_setup_for_fit(self, X, y, convert_dtype):
         self._set_output_type(X)
 
         # Reset the old tree data for new fit call
@@ -210,6 +217,7 @@ class BaseRandomForestModel(Base):
         if self.n_bins > self.n_rows:
             raise ValueError("The number of bins,`n_bins` can not be greater"
                              " than the number of samples used for training.")
+
         if self.RF_type == CLASSIFICATION:
             y_m, _, _, y_dtype = \
                 input_to_cuml_array(
@@ -330,23 +338,23 @@ class BaseRandomForestModel(Base):
                                         predict_proba=predict_proba)
         return preds
 
-    def _get_params(self, model, deep):
+    def _get_params(self, deep):
         params = dict()
-        for key in model.variables:
+        for key in BaseRandomForestModel.variables:
             if key in ['handle']:
                 continue
             var_value = getattr(self, key, None)
             params[key] = var_value
         return params
 
-    def _set_params(self, model, **params):
+    def _set_params(self, **params):
         self.handle.__setstate__(self.n_streams)
         self.model_pbuf_bytes = []
 
         if not params:
             return self
         for key, value in params.items():
-            if key not in model.variables:
+            if key not in BaseRandomForestModel.variables:
                 raise ValueError('Invalid parameter for estimator')
             else:
                 setattr(self, key, value)
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 205fb5b0c4..9a20b7da06 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -21,38 +21,34 @@
 # cython: language_level = 3
 
 import ctypes
-import cudf
-import cupy as cp
-import math
 import numpy as np
 import rmm
 import warnings
 
-from libcpp cimport bool
-from libcpp.vector cimport vector
-from libc.stdint cimport uintptr_t
-from libc.stdlib cimport calloc, malloc, free
-
-from cython.operator cimport dereference as deref
+import cuml.common.logger as logger
 
 from cuml import ForestInference
-from cuml.fil.fil import TreeliteModel
-
 from cuml.common.array import CumlArray
 from cuml.common.handle import Handle
-from cuml.ensemble.randomforest_common import BaseRandomForestModel
+from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
-from cuml.common.handle cimport cumlHandle
-from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
-    _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
-
+from cuml.ensemble.randomforest_common import _obtain_treelite_model, \
+    _obtain_fil_model
 from cuml.ensemble.randomforest_shared cimport *
-import cuml.common.logger as logger
-from cuml.common import input_to_cuml_array, rmm_cupy_ary
+
+from cuml.fil.fil import TreeliteModel
+
+from cython.operator cimport dereference as deref
+
+from libcpp cimport bool
+from libcpp.vector cimport vector
+from libc.stdint cimport uintptr_t
+from libc.stdlib cimport calloc, malloc, free
 
 from numba import cuda
 
+from cuml.common.handle cimport cumlHandle
 cimport cuml.common.handle
 cimport cuml.common.cuda
 
@@ -238,10 +234,10 @@ class RandomForestClassifier(BaseRandomForestModel):
 
         self.RF_type = CLASSIFICATION
         self.num_classes = 2
-        self._create_model(model=RandomForestClassifier,
-                           split_criterion=split_criterion,
-                           seed=seed, n_streams=n_streams,
-                           **kwargs)
+        super(RandomForestClassifier, self)._create_model(
+            split_criterion=split_criterion,
+            seed=seed, n_streams=n_streams,
+            **kwargs)
 
     """
     TODO:
@@ -413,7 +409,8 @@ class RandomForestClassifier(BaseRandomForestModel):
 
         """
 
-        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
+        X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
+                                                                convert_dtype)
         cdef uintptr_t X_ptr, y_ptr
         X_ptr = X_m.ptr
         y_ptr = y_m.ptr
@@ -532,7 +529,7 @@ class RandomForestClassifier(BaseRandomForestModel):
 
     def predict(self, X, predict_model="GPU",
                 output_class=True, threshold=0.5,
-                algo='auto',
+                algo='auto', num_classes=None,
                 convert_dtype=True,
                 fil_sparse_format='auto'):
         """
@@ -570,7 +567,7 @@ class RandomForestClassifier(BaseRandomForestModel):
             Threshold used for classification. Optional and required only
             while performing the predict operation on the GPU.
             It is applied if output_class == True, else it is ignored
-        num_classes : int (default = 2)
+        num_classes : int (default = None)
             number of different classes present in the dataset. This variable
             will be depricated in 0.16
         convert_dtype : bool, optional (default = True)
@@ -592,7 +589,12 @@ class RandomForestClassifier(BaseRandomForestModel):
         y : NumPy
            Dense vector (int) of shape (n_samples, 1)
         """
-        if predict_model == "CPU" or self.num_classes > 2:
+        if (num_classes and self.num_classes != num_classes):
+            raise ValueError("The number of classes in the test dataset"
+                             " should be equal to the number of classes"
+                             " present in the training dataset.")
+
+        elif predict_model == "CPU" or self.num_classes > 2:
             if self.num_classes > 2 and predict_model == "GPU":
                 warnings.warn("Switching over to use the CPU predict since "
                               "the GPU predict currently cannot perform "
@@ -681,7 +683,8 @@ class RandomForestClassifier(BaseRandomForestModel):
     def predict_proba(self, X, output_class=True,
                       threshold=0.5, algo='auto',
                       convert_dtype=True,
-                      fil_sparse_format='auto'):
+                      fil_sparse_format='auto',
+                      num_classes=None):
         """
         Predicts class probabilites for X. This function uses the GPU
         implementation of predict. Therefore, data with 'dtype = np.float32'
@@ -717,8 +720,9 @@ class RandomForestClassifier(BaseRandomForestModel):
             Threshold used for classification. Optional and required only
             while performing the predict operation on the GPU.
             It is applied if output_class == True, else it is ignored
-        num_classes : int (default = 2)
-            number of different classes present in the dataset
+        num_classes : int (default = None)
+            number of different classes present in the dataset. This variable
+            will be deprecated in 0.16
         convert_dtype : bool, optional (default = True)
             When set to True, the predict method will, when necessary, convert
             the input to the data type which was used to train the model. This
@@ -753,6 +757,11 @@ class RandomForestClassifier(BaseRandomForestModel):
                                       "classification models is currently not "
                                       "implemented. Please check cuml issue "
                                       "#1679 for more information.")
+
+        elif (num_classes and self.num_classes != num_classes):
+            raise ValueError("The number of classes in the test dataset"
+                             " should be equal to the number of classes"
+                             " present in the training dataset.")
         preds_proba = \
             self._predict_model_on_gpu(X, output_class=output_class,
                                        threshold=threshold,
@@ -764,7 +773,7 @@ class RandomForestClassifier(BaseRandomForestModel):
         return preds_proba
 
     def score(self, X, y, threshold=0.5,
-              algo='auto', num_classes=2, predict_model="GPU",
+              algo='auto', num_classes=None, predict_model="GPU",
               convert_dtype=True, fil_sparse_format='auto'):
         """
         Calculates the accuracy metric score of the model for X.
@@ -792,8 +801,9 @@ class RandomForestClassifier(BaseRandomForestModel):
             threshold is used to for classification
             This is optional and required only while performing the
             predict operation on the GPU.
-        num_classes : integer
-            number of different classes present in the dataset
+        num_classes : int (default = None)
+            number of different classes present in the dataset. This variable
+            will be deprecated in 0.16
         convert_dtype : boolean, default=True
             whether to convert input data to correct dtype automatically
         predict_model : String (default = 'GPU')
@@ -831,6 +841,7 @@ class RandomForestClassifier(BaseRandomForestModel):
                              threshold=threshold, algo=algo,
                              convert_dtype=convert_dtype,
                              predict_model=predict_model,
+                             num_classes=num_classes,
                              fil_sparse_format=fil_sparse_format)
 
         cdef uintptr_t preds_ptr
@@ -875,27 +886,23 @@ class RandomForestClassifier(BaseRandomForestModel):
         """
         Returns the value of all parameters
         required to configure this estimator as a dictionary.
-
         Parameters
         -----------
         deep : boolean (default = True)
         """
-        return self._get_params(model=RandomForestClassifier,
-                                deep=deep)
+        return self._get_params(deep=deep)
 
     def set_params(self, **params):
         """
         Sets the value of parameters required to
         configure this estimator, it functions similar to
         the sklearn set_params.
-
         Parameters
         -----------
         params : dict of new params
         """
-        # Resetting handle as __setstate__ overwrites with handle=Non
-        return self._set_params(model=RandomForestClassifier,
-                                **params)
+        # Resetting handle as __setstate__ overwrites with handle=None
+        return self._set_params(**params)
 
     def print_summary(self):
         """
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 2f41898113..3e5317660e 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -20,34 +20,34 @@
 # cython: language_level = 3
 
 import ctypes
-import cudf
-import math
 import numpy as np
+import rmm
 import warnings
 
-from libcpp cimport bool
-from libcpp.vector cimport vector
-from libc.stdint cimport uintptr_t
-from libc.stdlib cimport calloc, malloc, free
+import cuml.common.logger as logger
 
 from cuml import ForestInference
-from cuml.fil.fil import TreeliteModel
 from cuml.common.array import CumlArray
 from cuml.common.handle import Handle
-from cuml.ensemble.randomforest_common import BaseRandomForestModel
-from cuml.common.handle cimport cumlHandle
-from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
-    _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
-from cuml.ensemble.randomforest_common import BaseRandomForestModel
+from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
+from cuml.ensemble.randomforest_common import BaseRandomForestModel
+from cuml.ensemble.randomforest_common import _obtain_treelite_model, \
+    _obtain_fil_model
 from cuml.ensemble.randomforest_shared cimport *
-from cuml.common import input_to_cuml_array
-import cuml.common.logger as logger
+
+from cuml.fil.fil import TreeliteModel
 
 from cython.operator cimport dereference as deref
 
+from libcpp cimport bool
+from libcpp.vector cimport vector
+from libc.stdint cimport uintptr_t
+from libc.stdlib cimport calloc, malloc, free
+
 from numba import cuda
 
+from cuml.common.handle cimport cumlHandle
 cimport cuml.common.handle
 cimport cuml.common.cuda
 
@@ -216,11 +216,11 @@ class RandomForestRegressor(BaseRandomForestModel):
                  accuracy_metric='mse', n_streams=8,
                  **kwargs):
         self.RF_type = REGRESSION
-        self._create_model(model=RandomForestRegressor,
-                           split_criterion=split_criterion,
-                           seed=seed, n_streams=n_streams,
-                           accuracy_metric=accuracy_metric,
-                           **kwargs)
+        super(RandomForestRegressor, self)._create_model(
+            split_criterion=split_criterion,
+            seed=seed, n_streams=n_streams,
+            accuracy_metric=accuracy_metric,
+            **kwargs)
     """
     TODO:
         Add the preprocess and postprocess functions
@@ -306,7 +306,6 @@ class RandomForestRegressor(BaseRandomForestModel):
         tl_to_fil_model : Treelite version of this model
         """
         handle = self._obtain_treelite_handle()
-
         return _obtain_treelite_model(handle)
 
     def convert_to_fil_model(self, output_class=False,
@@ -317,7 +316,7 @@ class RandomForestRegressor(BaseRandomForestModel):
         Random Forest model.
         Parameters
         ----------
-        output_class : boolean (default = True)
+        output_class : boolean (default = False)
             This is optional and required only while performing the
             predict operation on the GPU.
             If true, return a 1 or 0 depending on whether the raw
@@ -377,7 +376,8 @@ class RandomForestRegressor(BaseRandomForestModel):
             ndarray, cuda array interface compliant array like CuPy
             These labels should be contiguous integers from 0 to n_classes.
         """
-        X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype)
+        X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
+                                                                convert_dtype)
 
         # Reset the old tree data for new fit call
         cdef uintptr_t X_ptr, y_ptr
@@ -663,27 +663,23 @@ class RandomForestRegressor(BaseRandomForestModel):
         """
         Returns the value of all parameters
         required to configure this estimator as a dictionary.
-
         Parameters
         -----------
         deep : boolean (default = True)
         """
-        return self._get_params(model=RandomForestRegressor,
-                                deep=deep)
+        return self._get_params(deep=deep)
 
     def set_params(self, **params):
         """
         Sets the value of parameters required to
         configure this estimator, it functions similar to
         the sklearn set_params.
-
         Parameters
         -----------
         params : dict of new params
         """
         # Resetting handle as __setstate__ overwrites with handle=None
-        return self._set_params(model=RandomForestRegressor,
-                                **params)
+        return self._set_params(**params)
 
     def print_summary(self):
         """

From 728600f3dd01cc101d920a8995c002fdae265898 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 29 May 2020 17:35:27 -0500
Subject: [PATCH 12/32] fixed style errors

---
 python/cuml/ensemble/randomforest_shared.pxd  | 31 +++----------------
 .../cuml/ensemble/randomforestregressor.pyx   |  4 +--
 2 files changed, 7 insertions(+), 28 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd
index 03665439e8..0531685efa 100644
--- a/python/cuml/ensemble/randomforest_shared.pxd
+++ b/python/cuml/ensemble/randomforest_shared.pxd
@@ -33,12 +33,10 @@ from cuml.common.handle import Handle
 from cuml import ForestInference
 from cuml.common.base import Base
 from cuml.common.handle cimport cumlHandle
-from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \
+from cuml.common import get_cudf_column_ptr, get_dev_array_ptr, \
     input_to_dev_array, zeros
 cimport cuml.common.handle
 cimport cuml.common.cuda
-cimport cython
-
 
 cdef extern from "treelite/c_api.h":
     ctypedef void* ModelHandle
@@ -47,7 +45,7 @@ cdef extern from "treelite/c_api.h":
                                          ModelHandle model)
     cdef const char* TreeliteGetLastError()
 
-cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
+cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
     cdef enum CRITERION:
         GINI,
         ENTROPY,
@@ -55,15 +53,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
         MAE,
         CRITERION_END
 
-cdef extern from "cuml/tree/flatnode.h" namespace "ML::Flatnode" nogil:
-    cdef cppclass SparseTreeNode[T, L]:
-        L prediction
-        int colid
-        T quesval
-        T best_metric_val
-        int left_child_id
-
-cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil:
+cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree":
     cdef struct DecisionTreeParams:
         int max_depth
         int max_leaves
@@ -75,15 +65,7 @@ cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil
         bool quantile_per_tree
         CRITERION split_criterion
 
-    cdef cppclass TreeMetaDataNode[T, L]:
-        int treeid
-        int depth_counter
-        int leaf_counter
-        double prepare_time
-        double train_time
-        vector[SparseTreeNode[T, L]] sparsetree
-
-cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
+cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
 
     cdef enum RF_type:
         CLASSIFICATION,
@@ -108,20 +90,18 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
         pass
 
     cdef cppclass RandomForestMetaData[T, L]:
-        ctypedef TreeMetaDataNode[T, L]* trees
+        void* trees
         RF_params rf_params
 
     #
     # Treelite handling
     #
-
     cdef void build_treelite_forest[T, L](ModelHandle*,
                                           RandomForestMetaData[T, L]*,
                                           int,
                                           int,
                                           vector[unsigned char] &) except +
 
-
     cdef vector[unsigned char] save_model_protobuf(ModelHandle) except +
 
     cdef void delete_rf_metadata[T, L](RandomForestMetaData[T, L]*) except +
@@ -148,4 +128,3 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil:
 
     cdef ModelHandle concatenate_trees(
         vector[ModelHandle] &treelite_handles) except +
-
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 3e5317660e..92f4522ab4 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -55,7 +55,7 @@ cimport cython
 
 
 cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
-    
+
     cdef void fit(cumlHandle & handle,
                   RandomForestMetaData[float, float]*,
                   float*,
@@ -73,7 +73,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML":
                   double*,
                   RF_params,
                   int) except +
-    
+
     cdef void predict(cumlHandle& handle,
                       RandomForestMetaData[float, float] *,
                       float*,

From a7072d76f1823a7bdec94eeead01a1e5799a4313 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Sun, 31 May 2020 17:05:02 -0500
Subject: [PATCH 13/32] add error to obtain_treelite_handle func

---
 python/cuml/ensemble/randomforest_common.pyx    | 6 ++++++
 python/cuml/ensemble/randomforestclassifier.pyx | 1 +
 2 files changed, 7 insertions(+)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 23d99ad492..af5166973f 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -187,6 +187,12 @@ class BaseRandomForestModel(Base):
             model_pbuf_vec = <vector[unsigned char] &> bytearray()
 
         if self.RF_type == CLASSIFICATION:
+            if self.num_classes > 2:
+                raise NotImplementedError("Pickling for multi-class "
+                                          "classification models is currently"
+                                          "  not implemented. Please check"
+                                          "  cuml issue #1679 for more"
+                                          "  information.")
             build_treelite_forest(
                 & cuml_model_ptr,
                 <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 9a20b7da06..6266ab57b5 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -594,6 +594,7 @@ class RandomForestClassifier(BaseRandomForestModel):
                              " should be equal to the number of classes"
                              " present in the training dataset.")
 
+
         elif predict_model == "CPU" or self.num_classes > 2:
             if self.num_classes > 2 and predict_model == "GPU":
                 warnings.warn("Switching over to use the CPU predict since "

From b120d7f774fe519bc175baf37ab50f85cfc58e58 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Sun, 31 May 2020 17:07:10 -0500
Subject: [PATCH 14/32] fix spacing issue

---
 python/cuml/ensemble/randomforestclassifier.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 6266ab57b5..9a20b7da06 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -594,7 +594,6 @@ class RandomForestClassifier(BaseRandomForestModel):
                              " should be equal to the number of classes"
                              " present in the training dataset.")
 
-
         elif predict_model == "CPU" or self.num_classes > 2:
             if self.num_classes > 2 and predict_model == "GPU":
                 warnings.warn("Switching over to use the CPU predict since "

From 81a34f3c5d2bd71b38283263a18e6388ff790e6e Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Tue, 2 Jun 2020 16:52:01 -0500
Subject: [PATCH 15/32] update rf common

---
 python/cuml/ensemble/randomforest_common.pyx | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index af5166973f..0385ac42a7 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -109,7 +109,6 @@ class BaseRandomForestModel(Base):
         self.max_depth = max_depth
         self.max_features = max_features
         self.bootstrap = bootstrap
-        self.verbose = verbose
         self.n_bins = n_bins
         self.n_cols = None
         self.dtype = dtype

From 494cdf69ad8b90ed866aa6e4e0c2363a1a50fa39 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Tue, 2 Jun 2020 17:38:25 -0500
Subject: [PATCH 16/32] remove rf common py file

---
 python/cuml/ensemble/randomforest_common.py | 94 ---------------------
 1 file changed, 94 deletions(-)
 delete mode 100644 python/cuml/ensemble/randomforest_common.py

diff --git a/python/cuml/ensemble/randomforest_common.py b/python/cuml/ensemble/randomforest_common.py
deleted file mode 100644
index 5b3ecd89b7..0000000000
--- a/python/cuml/ensemble/randomforest_common.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#
-# Copyright (c) 2020, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from cuml import ForestInference
-from cuml.fil.fil import TreeliteModel as tl
-
-
-def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
-    storage_format = _check_fil_sparse_format_value(fil_sparse_format)
-    if (depth > 16 and (storage_format == 'dense' or
-                        algo == 'tree_reorg' or
-                        algo == 'batch_tree_reorg')):
-        raise ValueError("While creating a forest with max_depth greater "
-                         "than 16, `fil_sparse_format` should be True. "
-                         "If `fil_sparse_format=False` then the memory"
-                         "consumed while creating the FIL forest is very "
-                         "large and the process will be aborted. In "
-                         "addition, `algo` must be either set to `naive' "
-                         "or `auto` to set 'fil_sparse_format=True`.")
-    return storage_format
-
-
-def _check_fil_sparse_format_value(fil_sparse_format):
-    accepted_vals = [True, False, 'auto']
-    if fil_sparse_format == 'auto':
-        storage_format = fil_sparse_format
-    elif not fil_sparse_format:
-        storage_format = 'dense'
-    elif fil_sparse_format not in accepted_vals:
-        raise ValueError("The value entered for spares_forest is not "
-                         "supported. Please refer to the documentation "
-                         "to see the accepted values.")
-    else:
-        storage_format = 'sparse'
-
-    return storage_format
-
-
-def _obtain_treelite_model(treelite_handle):
-    """
-    Creates a Treelite model using the treelite handle
-    obtained from the cuML Random Forest model.
-
-    Returns
-    ----------
-    tl_to_fil_model : Treelite version of this model
-    """
-    treelite_model = \
-        tl.from_treelite_model_handle(treelite_handle)
-    return treelite_model
-
-
-def _obtain_fil_model(treelite_handle, depth,
-                      output_class=True,
-                      threshold=0.5, algo='auto',
-                      fil_sparse_format='auto'):
-    """
-    Creates a Forest Inference (FIL) model using the treelite
-    handle obtained from the cuML Random Forest model.
-
-    Returns
-    ----------
-    fil_model :
-        A Forest Inference model which can be used to perform
-        inferencing on the random forest model.
-    """
-
-    storage_format = \
-        _check_fil_parameter_validity(depth=depth,
-                                      fil_sparse_format=fil_sparse_format,
-                                      algo=algo)
-
-    fil_model = ForestInference()
-    tl_to_fil_model = \
-        fil_model.load_using_treelite_handle(treelite_handle,
-                                             output_class=output_class,
-                                             threshold=threshold,
-                                             algo=algo,
-                                             storage_type=storage_format)
-
-    return tl_to_fil_model

From 8f77cdf2623350dae2bf9dc052c60b4af1b6126d Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 3 Jun 2020 06:49:18 -0500
Subject: [PATCH 17/32] update docs

---
 python/cuml/ensemble/randomforest_common.pyx    | 1 +
 python/cuml/ensemble/randomforestclassifier.pyx | 1 -
 python/cuml/ensemble/randomforestregressor.pyx  | 1 -
 3 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 0385ac42a7..97bbfe77a2 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -353,6 +353,7 @@ class BaseRandomForestModel(Base):
         return params
 
     def _set_params(self, **params):
+        # Resetting handle as __setstate__ overwrites with handle=None
         self.handle.__setstate__(self.n_streams)
         self.model_pbuf_bytes = []
 
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 9a20b7da06..1ec8179b92 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -901,7 +901,6 @@ class RandomForestClassifier(BaseRandomForestModel):
         -----------
         params : dict of new params
         """
-        # Resetting handle as __setstate__ overwrites with handle=None
         return self._set_params(**params)
 
     def print_summary(self):
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 92f4522ab4..9eb67631c8 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -678,7 +678,6 @@ class RandomForestRegressor(BaseRandomForestModel):
         -----------
         params : dict of new params
         """
-        # Resetting handle as __setstate__ overwrites with handle=None
         return self._set_params(**params)
 
     def print_summary(self):

From 5866f1797c0a99a310f2c1930ef8ef07ab1e8c1f Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 10 Jun 2020 15:18:31 -0500
Subject: [PATCH 18/32] update _create_model func to __init__

---
 python/cuml/ensemble/randomforest_common.pyx  | 35 +++++++++++--------
 .../cuml/ensemble/randomforestclassifier.pyx  | 22 +++++-------
 .../cuml/ensemble/randomforestregressor.pyx   | 13 +++----
 3 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 97bbfe77a2..d0c7ef02f0 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -43,20 +43,20 @@ class BaseRandomForestModel(Base):
                  'verbose', 'rows_sample',
                  'max_leaves', 'quantile_per_tree']
 
-    def _create_model(self, seed, split_criterion,
-                      n_streams, n_estimators=100,
-                      max_depth=16, handle=None, max_features='auto',
-                      n_bins=8, split_algo=1, bootstrap=True,
-                      bootstrap_features=False,
-                      verbose=False, min_rows_per_node=2,
-                      rows_sample=1.0, max_leaves=-1,
-                      accuracy_metric=None, dtype=None,
-                      output_type=None, min_samples_leaf=None,
-                      min_weight_fraction_leaf=None, n_jobs=None,
-                      max_leaf_nodes=None, min_impurity_decrease=0.0,
-                      min_impurity_split=None, oob_score=None,
-                      random_state=None, warm_start=None, class_weight=None,
-                      quantile_per_tree=False, criterion=None):
+    def __init__(self, split_criterion, seed=None,
+                 n_streams=8, n_estimators=100,
+                 max_depth=16, handle=None, max_features='auto',
+                 n_bins=8, split_algo=1, bootstrap=True,
+                 bootstrap_features=False,
+                 verbose=False, min_rows_per_node=2,
+                 rows_sample=1.0, max_leaves=-1,
+                 accuracy_metric=None, dtype=None,
+                 output_type=None, min_samples_leaf=None,
+                 min_weight_fraction_leaf=None, n_jobs=None,
+                 max_leaf_nodes=None, min_impurity_decrease=0.0,
+                 min_impurity_split=None, oob_score=None,
+                 random_state=None, warm_start=None, class_weight=None,
+                 quantile_per_tree=False, criterion=None):
 
         if accuracy_metric:
             BaseRandomForestModel.variables.append('accuracy_metric')
@@ -78,6 +78,13 @@ class BaseRandomForestModel(Base):
                                 " please read the cuML documentation for"
                                 " more information")
 
+        if ((seed is not None) and (n_streams != 1)):
+            warnings.warn("For reproducible results in Random Forest"
+                          " Classifier and for almost reproducible results"
+                          " in Random Forest Regressor, n_streams==1 is "
+                          "recommended. If n_streams is > 1, results may vary "
+                          "due to stream/thread timing differences, even when "
+                          "random_seed is set")
         if handle is None:
             handle = Handle(n_streams)
 
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 1ec8179b92..12f13f0af3 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -224,19 +224,13 @@ class RandomForestClassifier(BaseRandomForestModel):
         Seed for the random number generator. Unseeded by default.
     """
 
-    def __init__(self, split_criterion=0, seed=None,
-                 n_streams=8, **kwargs):
-        if ((seed is not None) and (n_streams != 1)):
-            warnings.warn("For reproducible results, n_streams==1 is "
-                          "recommended. If n_streams is > 1, results may vary "
-                          "due to stream/thread timing differences, even when "
-                          "random_seed is set")
+    def __init__(self, split_criterion=0,
+                 **kwargs):
 
         self.RF_type = CLASSIFICATION
         self.num_classes = 2
-        super(RandomForestClassifier, self)._create_model(
+        super(RandomForestClassifier, self).__init__(
             split_criterion=split_criterion,
-            seed=seed, n_streams=n_streams,
             **kwargs)
 
     """
@@ -248,7 +242,6 @@ class RandomForestClassifier(BaseRandomForestModel):
     """
     def __getstate__(self):
         state = self.__dict__.copy()
-        del state['handle']
         cdef size_t params_t
         cdef  RandomForestMetaData[float, int] *rf_forest
         cdef  RandomForestMetaData[double, int] *rf_forest64
@@ -268,16 +261,19 @@ class RandomForestClassifier(BaseRandomForestModel):
                     <RandomForestMetaData[double, int]*>params_t64
                 state["rf_params64"] = rf_forest64.rf_params
 
-        state['n_cols'] = self.n_cols
+        state["n_cols"] = self.n_cols
         state["verbose"] = self.verbose
         state["model_pbuf_bytes"] = self.model_pbuf_bytes
         state["treelite_handle"] = None
-
+        state["split_criterion"] = self.split_criterion
+        state["handle"] = self.handle
         return state
 
     def __setstate__(self, state):
         super(RandomForestClassifier, self).__init__(
-            handle=None, verbose=state['verbose'])
+            split_criterion=state["split_criterion"],
+            handle=state["handle"],
+            verbose=state["verbose"])
         cdef  RandomForestMetaData[float, int] *rf_forest = \
             new RandomForestMetaData[float, int]()
         cdef  RandomForestMetaData[double, int] *rf_forest64 = \
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 9eb67631c8..eead99e24f 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -212,13 +212,12 @@ class RandomForestRegressor(BaseRandomForestModel):
 
     """
 
-    def __init__(self, split_criterion=2, seed=None,
-                 accuracy_metric='mse', n_streams=8,
+    def __init__(self, split_criterion=2,
+                 accuracy_metric='mse',
                  **kwargs):
         self.RF_type = REGRESSION
-        super(RandomForestRegressor, self)._create_model(
+        super(RandomForestRegressor, self).__init__(
             split_criterion=split_criterion,
-            seed=seed, n_streams=n_streams,
             accuracy_metric=accuracy_metric,
             **kwargs)
     """
@@ -228,7 +227,6 @@ class RandomForestRegressor(BaseRandomForestModel):
     """
     def __getstate__(self):
         state = self.__dict__.copy()
-        del state['handle']
         cdef size_t params_t
         cdef  RandomForestMetaData[float, float] *rf_forest
         cdef  RandomForestMetaData[double, double] *rf_forest64
@@ -252,12 +250,15 @@ class RandomForestRegressor(BaseRandomForestModel):
         state["verbose"] = self.verbose
         state["model_pbuf_bytes"] = self.model_pbuf_bytes
         state["treelite_handle"] = None
+        state["split_criterion"] = self.split_criterion
+        state["handle"] = self.handle
 
         return state
 
     def __setstate__(self, state):
         super(RandomForestRegressor, self).__init__(
-            handle=None, verbose=state['verbose'])
+            split_criterion=state["split_criterion"],
+            handle=state["handle"], verbose=state['verbose'])
         cdef  RandomForestMetaData[float, float] *rf_forest = \
             new RandomForestMetaData[float, float]()
         cdef  RandomForestMetaData[double, double] *rf_forest64 = \

From 3bfe631a8f6effcd84384a8d8a3129ddfc28870f Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 10 Jun 2020 15:22:08 -0500
Subject: [PATCH 19/32] update n_streams warning

---
 python/cuml/ensemble/randomforest_common.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index d0c7ef02f0..e474b9fb87 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -80,7 +80,7 @@ class BaseRandomForestModel(Base):
 
         if ((seed is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results in Random Forest"
-                          " Classifier and for almost reproducible results"
+                          " Classifier or for almost reproducible results"
                           " in Random Forest Regressor, n_streams==1 is "
                           "recommended. If n_streams is > 1, results may vary "
                           "due to stream/thread timing differences, even when "

From 2a31c0c8eb313addf9654642610228bd2bfa53d1 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 17 Jun 2020 09:51:37 -0500
Subject: [PATCH 20/32] update code based on reviews

---
 python/cuml/ensemble/__init__.py              |   2 +-
 python/cuml/ensemble/randomforest_common.pyx  | 111 ++++++++++++------
 .../cuml/ensemble/randomforestclassifier.pyx  |   4 +-
 .../cuml/ensemble/randomforestregressor.pyx   |   6 +-
 python/cuml/fil/fil.pyx                       |  18 +--
 5 files changed, 90 insertions(+), 51 deletions(-)

diff --git a/python/cuml/ensemble/__init__.py b/python/cuml/ensemble/__init__.py
index 1bb48ec807..7cd2567acf 100644
--- a/python/cuml/ensemble/__init__.py
+++ b/python/cuml/ensemble/__init__.py
@@ -18,4 +18,4 @@
 from cuml.ensemble.randomforestclassifier import RandomForestClassifier
 from cuml.ensemble.randomforestregressor import RandomForestRegressor
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
-    _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model
+    _obtain_treelite_model, _obtain_fil_model
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index e474b9fb87..6af8d16d35 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -42,6 +42,8 @@ class BaseRandomForestModel(Base):
                  'bootstrap', 'bootstrap_features',
                  'verbose', 'rows_sample',
                  'max_leaves', 'quantile_per_tree']
+    criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
+                      '3': MAE, '4': CRITERION_END}
 
     def __init__(self, split_criterion, seed=None,
                  n_streams=8, n_estimators=100,
@@ -73,10 +75,12 @@ class BaseRandomForestModel(Base):
 
         for key, vals in sklearn_params.items():
             if vals:
-                raise TypeError(" The Scikit-learn variable ", key,
-                                " is not supported in cuML,"
-                                " please read the cuML documentation for"
-                                " more information")
+                raise TypeError(
+                    " The Scikit-learn variable ", key,
+                    " is not supported in cuML,"
+                    " please read the cuML documentation at "
+                    "(https://docs.rapids.ai/api/cuml/nightly/"
+                    "api.html#random-forest) for more information")
 
         if ((seed is not None) and (n_streams != 1)):
             warnings.warn("For reproducible results in Random Forest"
@@ -97,15 +101,15 @@ class BaseRandomForestModel(Base):
             raise ValueError("Must specify max_depth >0 ")
 
         self.split_algo = split_algo
-        criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE,
-                          '3': MAE, '4': CRITERION_END}
-        if str(split_criterion) not in criterion_dict.keys():
+        if (str(split_criterion) not in
+                BaseRandomForestModel.criterion_dict.keys()):
             warnings.warn("The split criterion chosen was not present"
                           " in the list of options accepted by the model"
                           " and so the CRITERION_END option has been chosen.")
             self.split_criterion = CRITERION_END
         else:
-            self.split_criterion = criterion_dict[str(split_criterion)]
+            self.split_criterion = \
+                BaseRandomForestModel.criterion_dict[str(split_criterion)]
 
         self.min_rows_per_node = min_rows_per_node
         self.min_impurity_decrease = min_impurity_decrease
@@ -143,27 +147,33 @@ class BaseRandomForestModel(Base):
             else:
                 return 1.0
         else:
-            raise ValueError("Wrong value passed in for max_features"
-                             " please read the documentation")
+            raise ValueError(
+                "Wrong value passed in for max_features"
+                " please read the documentation present at "
+                "(https://docs.rapids.ai/api/cuml/nightly/api.html"
+                "#random-forest)")
 
     def _get_protobuf_bytes(self):
         """
         Returns the self.model_pbuf_bytes.
         Cuml RF model gets converted to treelite protobuf bytes by:
+
             1. converting the cuml RF model to a treelite model. The treelite
             models handle (pointer) is returned
+
             2. The treelite model handle is used to convert the treelite model
             to a treelite protobuf model which is stored in a temporary file.
             The protobuf model information is read from the temporary file and
             the byte information is returned.
+
         The treelite handle is stored `self.treelite_handle` and the treelite
         protobuf model bytes are stored in `self.model_pbuf_bytes`. If either
         of information is already present in the model then the respective
         step is skipped.
         """
         if self.dtype == np.float64:
-            raise TypeError("To use pickling, first train the model"
-                            " using float 32 data.")
+            raise TypeError("Pickling is only supported on models trained"
+                            " on float32 data.")
         if self.model_pbuf_bytes:
             return self.model_pbuf_bytes
         elif self.treelite_handle:
@@ -197,7 +207,7 @@ class BaseRandomForestModel(Base):
                 raise NotImplementedError("Pickling for multi-class "
                                           "classification models is currently"
                                           "  not implemented. Please check"
-                                          "  cuml issue #1679 for more"
+                                          "  cuml GitHub issue #1679 for more"
                                           "  information.")
             build_treelite_forest(
                 & cuml_model_ptr,
@@ -239,7 +249,7 @@ class BaseRandomForestModel(Base):
                     check_rows=self.n_rows, check_cols=1)
             if y_dtype != np.int32:
                 raise TypeError("The labels `y` need to be of dtype"
-                                " `np.int32`")
+                                " `int32`")
             unique_labels = rmm_cupy_ary(cp.unique, y_m)
             self.num_classes = len(unique_labels)
             for i in range(self.num_classes):
@@ -271,14 +281,14 @@ class BaseRandomForestModel(Base):
         cdef ModelHandle cuml_model_ptr = NULL
         if self.RF_type == CLASSIFICATION:
             build_treelite_forest(
-                & cuml_model_ptr,
+                &cuml_model_ptr,
                 <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
                 <int> self.n_cols,
                 <int> self.num_classes,
                 <vector[unsigned char] &> model_bytes)
         else:
             build_treelite_forest(
-                & cuml_model_ptr,
+                &cuml_model_ptr,
                 <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
                 <int> self.n_cols,
                 <int> REGRESSION_MODEL,
@@ -292,14 +302,18 @@ class BaseRandomForestModel(Base):
         cdef vector[ModelHandle] *model_handles \
             = new vector[ModelHandle]()
         cdef uintptr_t mod_ptr
+
         for i in treelite_handle:
             mod_ptr = <uintptr_t>i
             model_handles.push_back((
                 <ModelHandle> mod_ptr))
+
         self._reset_forest_data()
+
         concat_model_handle = concatenate_trees(deref(model_handles))
         cdef uintptr_t concat_model_ptr = <uintptr_t> concat_model_handle
         self.treelite_handle = concat_model_ptr
+
         cdef vector[unsigned char] pbuf_mod_info = \
             save_model(<ModelHandle> concat_model_ptr)
         cdef unsigned char[::1] pbuf_mod_view = \
@@ -327,8 +341,8 @@ class BaseRandomForestModel(Base):
             raise TypeError("GPU based predict only accepts np.float32 data. \
                             Please set convert_dtype=True to convert the test \
                             data to the same dtype as the data used to train, \
-                            ie. np.float32. If you would like to use test \
-                            data of dtype=np.float64 please set \
+                            ie. float32. If you would like to use test \
+                            data of dtype=float64 please set \
                             predict_model='CPU' to use the CPU implementation \
                             of predict.")
 
@@ -360,8 +374,6 @@ class BaseRandomForestModel(Base):
         return params
 
     def _set_params(self, **params):
-        # Resetting handle as __setstate__ overwrites with handle=None
-        self.handle.__setstate__(self.n_streams)
         self.model_pbuf_bytes = []
 
         if not params:
@@ -375,8 +387,40 @@ class BaseRandomForestModel(Base):
 
 
 def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
-    storage_format = _check_fil_sparse_format_value(fil_sparse_format)
-    if (depth > 16 and (storage_format == 'dense' or
+    """
+    Check if the FIL storage format type passed by the user is right
+    for the trained cuml Random Forest model they have.
+
+    Parameters
+    ----------
+    depth : max depth value used to train model
+    algo : string (default = 'auto')
+        This is optional and required only while performing the
+        predict operation on the GPU.
+        'naive' - simple inference using shared memory
+        'tree_reorg' - similar to naive but trees rearranged to be more
+        coalescing-friendly
+        'batch_tree_reorg' - similar to tree_reorg but predicting
+        multiple rows per thread block
+        `auto` - choose the algorithm automatically. Currently
+        'batch_tree_reorg' is used for dense storage
+        and 'naive' for sparse storage
+    fil_sparse_format : boolean or string (default = 'auto')
+        This variable is used to choose the type of forest that will be
+        created in the Forest Inference Library. It is not required
+        while using predict_model='CPU'.
+        'auto' - choose the storage type automatically
+        (currently True is chosen by auto)
+        False - create a dense forest
+        True - create a sparse forest, requires algo='naive'
+        or algo='auto'
+    Returns
+    ----------
+    fil_sparse_format converted to string
+    """
+    accepted_fil_spars_format = {True, False, 'auto'}
+
+    if (depth > 16 and (fil_sparse_format is False or
                         algo == 'tree_reorg' or
                         algo == 'batch_tree_reorg')):
         raise ValueError("While creating a forest with max_depth greater "
@@ -386,22 +430,13 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
                          "large and the process will be aborted. In "
                          "addition, `algo` must be either set to `naive' "
                          "or `auto` to set 'fil_sparse_format=True`.")
-    return storage_format
-
-
-def _check_fil_sparse_format_value(fil_sparse_format):
-    accepted_vals = [True, False, 'auto']
-    if fil_sparse_format == 'auto':
-        storage_format = fil_sparse_format
-    elif not fil_sparse_format:
-        storage_format = 'dense'
-    elif fil_sparse_format not in accepted_vals:
-        raise ValueError("The value entered for spares_forest is not "
-                         "supported. Please refer to the documentation "
-                         "to see the accepted values.")
-    else:
-        storage_format = 'sparse'
-    return storage_format
+    if fil_sparse_format not in accepted_fil_spars_format:
+        raise ValueError(
+            "The value entered for spares_forest is not "
+            "supported. Please refer to the documentation at "
+            "(https://docs.rapids.ai/api/cuml/nightly/api.html"
+            "#forest-inferencing) to see the accepted values.")
+    return str(fil_sparse_format)
 
 
 def _obtain_treelite_model(treelite_handle):
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 12f13f0af3..999b73a5d3 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -400,8 +400,8 @@ class RandomForestClassifier(BaseRandomForestModel):
             These labels should be contiguous integers from 0 to n_classes.
         convert_dtype : bool, optional (default = False)
             When set to True, the fit method will, when necessary, convert
-            y to be the same data type as X if they differ. This will increase
-            memory used for the method.
+            y to be of dtype int32. This will increase memory used for
+            the method.
 
         """
 
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index eead99e24f..0c9f58549c 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -334,7 +334,7 @@ class RandomForestRegressor(BaseRandomForestModel):
             `auto` - choose the algorithm automatically. Currently
             'batch_tree_reorg' is used for dense storage
             and 'naive' for sparse storage
-        fil_sparse_format : boolean or string (default = auto)
+        fil_sparse_format : boolean or string (default = 'auto')
             This variable is used to choose the type of forest that will be
             created in the Forest Inference Library. It is not required
             while using predict_model='CPU'.
@@ -376,6 +376,10 @@ class RandomForestRegressor(BaseRandomForestModel):
             Acceptable formats: NumPy ndarray, Numba device
             ndarray, cuda array interface compliant array like CuPy
             These labels should be contiguous integers from 0 to n_classes.
+        convert_dtype : bool, optional (default = False)
+            When set to True, the fit method will, when necessary, convert
+            y to be the same data type as X if they differ. This will increase
+            memory used for the method.
         """
         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                 convert_dtype)
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 68e7351bb7..2e239cc462 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -226,15 +226,16 @@ cdef class ForestInference_impl():
         return algo_dict[algo_str]
 
     def get_storage_type(self, storage_type_str):
-        storage_type_dict={'AUTO': storage_type_t.AUTO,
-                           'auto': storage_type_t.AUTO,
-                           'DENSE': storage_type_t.DENSE,
-                           'dense': storage_type_t.DENSE,
-                           'SPARSE': storage_type_t.SPARSE,
-                           'sparse': storage_type_t.SPARSE}
+        storage_type_dict={'auto': storage_type_t.AUTO,
+                           'False': storage_type_t.DENSE,
+                           'True': storage_type_t.SPARSE}
+
         if storage_type_str not in storage_type_dict.keys():
-            raise ValueError(' Wrong sparsity selected please refer'
-                             ' to the documentation')
+            raise ValueError(
+                "The value entered for spares_forest is not "
+                "supported. Please refer to the documentation at"
+                "(https://docs.rapids.ai/api/cuml/nightly/api.html#"
+                "forest-inferencing) to see the accepted values.")
         return storage_type_dict[storage_type_str]
 
     def predict(self, X, output_type='numpy', predict_proba=False, preds=None):
@@ -336,7 +337,6 @@ cdef class ForestInference_impl():
         treelite_params.threshold = threshold
         treelite_params.algo = self.get_algo(algo)
         treelite_params.storage_type = self.get_storage_type(storage_type)
-
         cdef cumlHandle* handle_ =\
             <cumlHandle*><size_t>self.handle.getHandle()
         cdef uintptr_t model_ptr = <uintptr_t>model_handle

From acdc3814e05824f56489eb45adc1a4b0b70482d1 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 17 Jun 2020 11:04:05 -0500
Subject: [PATCH 21/32] fix style errors

---
 python/cuml/ensemble/randomforest_common.pyx |  8 +++---
 python/cuml/fil/fil.pyx                      | 26 ++++++++++----------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 6af8d16d35..8b8e54f56c 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -210,14 +210,14 @@ class BaseRandomForestModel(Base):
                                           "  cuml GitHub issue #1679 for more"
                                           "  information.")
             build_treelite_forest(
-                & cuml_model_ptr,
+                &cuml_model_ptr,
                 <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
                 <int> self.n_cols,
                 <int> self.num_classes,
                 model_pbuf_vec)
         else:
             build_treelite_forest(
-                & cuml_model_ptr,
+                &cuml_model_ptr,
                 <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
                 <int> self.n_cols,
                 <int> REGRESSION_MODEL,
@@ -416,7 +416,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
         or algo='auto'
     Returns
     ----------
-    fil_sparse_format converted to string
+    fil_sparse_format
     """
     accepted_fil_spars_format = {True, False, 'auto'}
 
@@ -436,7 +436,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
             "supported. Please refer to the documentation at "
             "(https://docs.rapids.ai/api/cuml/nightly/api.html"
             "#forest-inferencing) to see the accepted values.")
-    return str(fil_sparse_format)
+    return fil_sparse_format
 
 
 def _obtain_treelite_model(treelite_handle):
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 2e239cc462..69d078164d 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -227,12 +227,12 @@ cdef class ForestInference_impl():
 
     def get_storage_type(self, storage_type_str):
         storage_type_dict={'auto': storage_type_t.AUTO,
-                           'False': storage_type_t.DENSE,
-                           'True': storage_type_t.SPARSE}
+                           False: storage_type_t.DENSE,
+                           True: storage_type_t.SPARSE}
 
         if storage_type_str not in storage_type_dict.keys():
             raise ValueError(
-                "The value entered for spares_forest is not "
+                "The value entered for storage_type is not "
                 "supported. Please refer to the documentation at"
                 "(https://docs.rapids.ai/api/cuml/nightly/api.html#"
                 "forest-inferencing) to see the accepted values.")
@@ -508,11 +508,11 @@ class ForestInference(Base):
             only if output_class == True, else it is ignored.
         storage_type : string (default='auto')
             In-memory storage format to be used for the FIL model.
-             'AUTO' or 'auto' - choose the storage type automatically
-                                (currently DENSE is always used)
-             'DENSE' or 'dense' - create a dense forest
-             'SPARSE' or 'sparse' - create a sparse forest;
-                                    requires algo='NAIVE' or algo='AUTO'
+             'auto' - choose the storage type automatically
+                      (currently DENSE is always used)
+             'False' - create a dense forest
+             'True' - create a sparse forest;
+                      requires algo='NAIVE' or algo='AUTO'
 
         Returns
         ----------
@@ -563,11 +563,11 @@ class ForestInference(Base):
             only if output_class == True, else it is ignored.
         storage_type : string (default='auto')
             In-memory storage format to be used for the FIL model.
-             'AUTO' or 'auto' - choose the storage type automatically
-                                (currently DENSE is always used)
-             'DENSE' or 'dense' - create a dense forest
-             'SPARSE' or 'sparse' - create a sparse forest;
-                                    requires algo='NAIVE' or algo='AUTO'.
+             'auto' - choose the storage type automatically
+                      (currently DENSE is always used)
+             'False' - create a dense forest
+             'True' - create a sparse forest;
+                      requires algo='NAIVE' or algo='AUTO'
 
         Returns
         ----------

From 4ed2fb7cc3306edcbc1b1085ae390cab9b88833c Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 17 Jun 2020 11:11:52 -0500
Subject: [PATCH 22/32] remove obtain_treelite_model func from common

---
 python/cuml/ensemble/randomforest_common.pyx    | 16 +---------------
 python/cuml/ensemble/randomforestclassifier.pyx |  4 ++--
 python/cuml/ensemble/randomforestregressor.pyx  |  5 ++---
 3 files changed, 5 insertions(+), 20 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 8b8e54f56c..a24e24365c 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -338,7 +338,7 @@ class BaseRandomForestModel(Base):
                                 check_cols=self.n_cols)
 
         if dtype == np.float64 and not convert_dtype:
-            raise TypeError("GPU based predict only accepts np.float32 data. \
+            raise TypeError("GPU based predict only accepts float32 data. \
                             Please set convert_dtype=True to convert the test \
                             data to the same dtype as the data used to train, \
                             ie. float32. If you would like to use test \
@@ -439,20 +439,6 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
     return fil_sparse_format
 
 
-def _obtain_treelite_model(treelite_handle):
-    """
-    Creates a Treelite model using the treelite handle
-    obtained from the cuML Random Forest model.
-
-    Returns
-    ----------
-    tl_to_fil_model : Treelite version of this model
-    """
-    treelite_model = \
-        TreeliteModel.from_treelite_model_handle(treelite_handle)
-    return treelite_model
-
-
 def _obtain_fil_model(treelite_handle, depth,
                       output_class=True,
                       threshold=0.5, algo='auto',
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 999b73a5d3..4343d32a5d 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -321,8 +321,8 @@ class RandomForestClassifier(BaseRandomForestModel):
         ----------
         tl_to_fil_model : Treelite version of this model
         """
-        handle = self._obtain_treelite_handle()
-        return _obtain_treelite_model(handle)
+        treelite_handle = self._obtain_treelite_handle()
+        return TreeliteModel.from_treelite_model_handle(treelite_handle)
 
     def convert_to_fil_model(self, output_class=True,
                              threshold=0.5, algo='auto',
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 6a270cde53..b8c34b4ac5 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -249,7 +249,6 @@ class RandomForestRegressor(BaseRandomForestModel):
         state['n_cols'] = self.n_cols
         state["verbose"] = self.verbose
         state["model_pbuf_bytes"] = self.model_pbuf_bytes
-        state['handle'] = self.handle
         state["treelite_handle"] = None
         state["split_criterion"] = self.split_criterion
         state["handle"] = self.handle
@@ -307,8 +306,8 @@ class RandomForestRegressor(BaseRandomForestModel):
         ----------
         tl_to_fil_model : Treelite version of this model
         """
-        handle = self._obtain_treelite_handle()
-        return _obtain_treelite_model(handle)
+        treelite_handle = self._obtain_treelite_handle()
+        return TreeliteModel.from_treelite_model_handle(treelite_handle)
 
     def convert_to_fil_model(self, output_class=False,
                              algo='auto',

From f46a9892f7af60bcd125d7c2d192baf16d344231 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 17 Jun 2020 12:27:16 -0500
Subject: [PATCH 23/32] update rf and fil to accept storage type as bool or
 'auto'

---
 python/cuml/ensemble/__init__.py              |  2 +-
 python/cuml/ensemble/randomforest_common.pyx  | 19 ++++++++++---------
 .../cuml/ensemble/randomforestclassifier.pyx  |  3 +--
 .../cuml/ensemble/randomforestregressor.pyx   |  3 +--
 python/cuml/fil/fil.pyx                       |  4 ++--
 5 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/python/cuml/ensemble/__init__.py b/python/cuml/ensemble/__init__.py
index 7cd2567acf..d42ae676b6 100644
--- a/python/cuml/ensemble/__init__.py
+++ b/python/cuml/ensemble/__init__.py
@@ -18,4 +18,4 @@
 from cuml.ensemble.randomforestclassifier import RandomForestClassifier
 from cuml.ensemble.randomforestregressor import RandomForestRegressor
 from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \
-    _obtain_treelite_model, _obtain_fil_model
+    _obtain_fil_model
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index a24e24365c..3f89b6aec1 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -266,15 +266,13 @@ class BaseRandomForestModel(Base):
                     check_rows=self.n_rows, check_cols=1)
 
         if self.dtype == np.float64:
-            warnings.warn("To use GPU-based prediction, first train using \
-                          float 32 data to fit the estimator.")
+            warnings.warn("To use pickling or GPU-based prediction first "
+                          "train using float32 data to fit the estimator")
 
         max_feature_val = self._get_max_feat_val()
         if type(self.min_rows_per_node) == float:
             self.min_rows_per_node = \
                 math.ceil(self.min_rows_per_node*self.n_rows)
-        del X
-        del y
         return X_m, y_m, max_feature_val
 
     def _tl_model_handles(self, model_bytes):
@@ -334,10 +332,13 @@ class BaseRandomForestModel(Base):
         out_type = self._get_output_type(X)
         cdef ModelHandle cuml_model_ptr = NULL
         _, n_rows, n_cols, dtype = \
-            input_to_cuml_array(X, order='F',
-                                check_cols=self.n_cols)
+            input_to_cuml_array(
+                X, order='F',
+                check_cols=self.n_cols,
+                convert_to_dtype=(self.dtype if convert_dtype
+                                  else None))
 
-        if dtype == np.float64 and not convert_dtype:
+        if dtype == np.float64:
             raise TypeError("GPU based predict only accepts float32 data. \
                             Please set convert_dtype=True to convert the test \
                             data to the same dtype as the data used to train, \
@@ -416,7 +417,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
         or algo='auto'
     Returns
     ----------
-    fil_sparse_format
+    fil_sparse_format as a string
     """
     accepted_fil_spars_format = {True, False, 'auto'}
 
@@ -436,7 +437,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
             "supported. Please refer to the documentation at "
             "(https://docs.rapids.ai/api/cuml/nightly/api.html"
             "#forest-inferencing) to see the accepted values.")
-    return fil_sparse_format
+    return str(fil_sparse_format)
 
 
 def _obtain_fil_model(treelite_handle, depth,
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index 4343d32a5d..420482fd10 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -33,8 +33,7 @@ from cuml.common.handle import Handle
 from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
-from cuml.ensemble.randomforest_common import _obtain_treelite_model, \
-    _obtain_fil_model
+from cuml.ensemble.randomforest_common import _obtain_fil_model
 from cuml.ensemble.randomforest_shared cimport *
 
 from cuml.fil.fil import TreeliteModel
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index b8c34b4ac5..1019374381 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -32,8 +32,7 @@ from cuml.common.handle import Handle
 from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
 from cuml.ensemble.randomforest_common import BaseRandomForestModel
-from cuml.ensemble.randomforest_common import _obtain_treelite_model, \
-    _obtain_fil_model
+from cuml.ensemble.randomforest_common import _obtain_fil_model
 from cuml.ensemble.randomforest_shared cimport *
 
 from cuml.fil.fil import TreeliteModel
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 69d078164d..1901d3e61f 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -227,8 +227,8 @@ cdef class ForestInference_impl():
 
     def get_storage_type(self, storage_type_str):
         storage_type_dict={'auto': storage_type_t.AUTO,
-                           False: storage_type_t.DENSE,
-                           True: storage_type_t.SPARSE}
+                           'False': storage_type_t.DENSE,
+                           'True': storage_type_t.SPARSE}
 
         if storage_type_str not in storage_type_dict.keys():
             raise ValueError(

From a5fecceb60293a2a21ad505d7fdbb70245655531 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Wed, 17 Jun 2020 12:38:26 -0500
Subject: [PATCH 24/32] update docs

---
 python/cuml/ensemble/randomforest_common.pyx | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 3f89b6aec1..6afeec2c92 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -158,10 +158,9 @@ class BaseRandomForestModel(Base):
         Returns the self.model_pbuf_bytes.
         Cuml RF model gets converted to treelite protobuf bytes by:
 
-            1. converting the cuml RF model to a treelite model. The treelite
+            * Converting the cuml RF model to a treelite model. The treelite
             models handle (pointer) is returned
-
-            2. The treelite model handle is used to convert the treelite model
+            * The treelite model handle is used to convert the treelite model
             to a treelite protobuf model which is stored in a temporary file.
             The protobuf model information is read from the temporary file and
             the byte information is returned.

From aee0037e8598d6f19f809fe2968430926e6f1f07 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 19 Jun 2020 11:26:48 -0500
Subject: [PATCH 25/32] update rf code

---
 python/cuml/dask/ensemble/base.py             |  5 +-
 .../dask/ensemble/randomforestclassifier.py   | 20 ++-----
 .../dask/ensemble/randomforestregressor.py    | 17 ++----
 python/cuml/ensemble/randomforest_common.pyx  | 56 +++++++++----------
 4 files changed, 43 insertions(+), 55 deletions(-)

diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py
index fe94321db0..e4b3dc9e75 100644
--- a/python/cuml/dask/ensemble/base.py
+++ b/python/cuml/dask/ensemble/base.py
@@ -15,6 +15,7 @@
 
 import dask
 import math
+from dask.distributed import wait
 
 from cuml.dask.common.input_utils import DistributedDataHandler, \
     concatenate
@@ -124,10 +125,12 @@ def _concat_treelite_models(self):
         model._concatenate_treelite_handle(all_tl_mod_handles)
         for tl_handle in all_tl_mod_handles:
             TreeliteModel.free_treelite_model(tl_handle)
-
+        wait(model)
         return model
 
     def _predict_using_fil(self, X, delayed, **kwargs):
+        if self.local_model is None:
+            self.local_model = self._concat_treelite_models()
         data = DistributedDataHandler.create(X, client=self.client)
         self.datatype = data.datatype
         return self._predict(X, delayed=delayed, **kwargs)
diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 27973f219e..422029b277 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -295,23 +295,15 @@ def predict(self, X, output_class=True, algo='auto', threshold=0.5,
 
         else:
             preds = \
-                self.predict_using_fil(X, output_class=output_class,
-                                       algo=algo,
-                                       threshold=threshold,
-                                       convert_dtype=convert_dtype,
-                                       predict_model="GPU",
-                                       fil_sparse_format=fil_sparse_format,
-                                       delayed=delayed)
+                self._predict_using_fil(X, output_class=output_class,
+                                        algo=algo,
+                                        threshold=threshold,
+                                        convert_dtype=convert_dtype,
+                                        fil_sparse_format=fil_sparse_format,
+                                        delayed=delayed)
 
         return preds
 
-    def predict_using_fil(self, X, delayed, **kwargs):
-        if self.local_model is None:
-            self.local_model = self._concat_treelite_models()
-
-        return self._predict_using_fil(X=X,
-                                       delayed=delayed,
-                                       **kwargs)
     """
     TODO : Update function names used for CPU predict.
         Cuml issue #1854 has been created to track this.
diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py
index dd98660f0c..09b204a455 100755
--- a/python/cuml/dask/ensemble/randomforestregressor.py
+++ b/python/cuml/dask/ensemble/randomforestregressor.py
@@ -270,20 +270,13 @@ def predict(self, X, predict_model="GPU", algo='auto',
 
         else:
             preds = \
-                self.predict_using_fil(X, predict_model=predict_model,
-                                       algo=algo,
-                                       convert_dtype=convert_dtype,
-                                       fil_sparse_format=fil_sparse_format,
-                                       delayed=delayed)
+                self._predict_using_fil(X,
+                                        algo=algo,
+                                        convert_dtype=convert_dtype,
+                                        fil_sparse_format=fil_sparse_format,
+                                        delayed=delayed)
         return preds
 
-    def predict_using_fil(self, X, delayed, **kwargs):
-        if self.local_model is None:
-            self.local_model = self._concat_treelite_models()
-        return self._predict_using_fil(X=X,
-                                       delayed=delayed,
-                                       **kwargs)
-
     """
     TODO : Update function names used for CPU predict.
            Cuml issue #1854 has been created to track this.
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index d9a313e65d..06b6817a1b 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -184,37 +184,38 @@ class BaseRandomForestModel(Base):
         return self.treelite_serialized_model
 
     def _obtain_treelite_handle(self):
+        assert self.treelite_serialized_model or self.rf_forest, \
+            "Attempting to create treelite from un-fit forest."
+
+        cdef ModelHandle tl_handle = NULL
         if self.treelite_handle:
             return self.treelite_handle  # Use cached version
-        cdef ModelHandle tl_handle = NULL
 
-        if self.treelite_serialized_model:  # bytes -> Treelite
+        elif self.treelite_serialized_model:  # bytes -> Treelite
             tl_handle = <ModelHandle><uintptr_t>treelite_deserialize(
                 self.treelite_serialized_model)
 
-        assert self.treelite_serialized_model or self.rf_forest, \
-            "Attempting to create treelite from un-fit forest."
-
-        if self.RF_type == CLASSIFICATION:
-            if self.num_classes > 2:
-                raise NotImplementedError("Pickling for multi-class "
-                                          "classification models is currently"
-                                          "  not implemented. Please check"
-                                          "  cuml GitHub issue #1679 for more"
-                                          "  information.")
-            build_treelite_forest(
-                &tl_handle,
-                <RandomForestMetaData[float, int]*><size_t> self.rf_forest,
-                <int> self.n_cols,
-                <int> self.num_classes,
-                model_pbuf_vec)
         else:
-            build_treelite_forest(
-                &tl_handle,
-                <RandomForestMetaData[float, float]*><size_t> self.rf_forest,
-                <int> self.n_cols,
-                <int> REGRESSION_MODEL,
-                model_pbuf_vec)
+            if self.RF_type == CLASSIFICATION:
+                if self.num_classes > 2:
+                    raise NotImplementedError(
+                        "Pickling for multi-class classification models"
+                        " is currently not implemented. Please check"
+                        " cuml GitHub issue #1679 for more information.")
+
+                build_treelite_forest(
+                    &tl_handle,
+                    <RandomForestMetaData[float, int]*>
+                    <uintptr_t> self.rf_forest,
+                    <int> self.n_cols,
+                    <int> self.num_classes)
+            else:
+                build_treelite_forest(
+                    &tl_handle,
+                    <RandomForestMetaData[float, float]*>
+                    <uintptr_t> self.rf_forest,
+                    <int> self.n_cols,
+                    <int> REGRESSION_MODEL)
 
         self.treelite_handle = <uintptr_t> tl_handle
         return self.treelite_handle
@@ -298,10 +299,9 @@ class BaseRandomForestModel(Base):
 
         return self
 
-    def _predict_model_on_gpu(self, X, output_class,
-                              threshold, algo,
-                              num_classes, convert_dtype,
-                              fil_sparse_format, predict_proba):
+    def _predict_model_on_gpu(self, X, algo, convert_dtype,
+                              fil_sparse_format, threshold=0.5,
+                              output_class=False, predict_proba=False):
         out_type = self._get_output_type(X)
         _, n_rows, n_cols, dtype = \
             input_to_cuml_array(X, order='F',

From 845ec9368b61ba127a4e5b9e81be81f0bc1bf099 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 19 Jun 2020 11:52:16 -0500
Subject: [PATCH 26/32] remove debugging code from base

---
 python/cuml/dask/ensemble/base.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py
index e4b3dc9e75..0777841c2b 100644
--- a/python/cuml/dask/ensemble/base.py
+++ b/python/cuml/dask/ensemble/base.py
@@ -15,7 +15,6 @@
 
 import dask
 import math
-from dask.distributed import wait
 
 from cuml.dask.common.input_utils import DistributedDataHandler, \
     concatenate
@@ -125,7 +124,6 @@ def _concat_treelite_models(self):
         model._concatenate_treelite_handle(all_tl_mod_handles)
         for tl_handle in all_tl_mod_handles:
             TreeliteModel.free_treelite_model(tl_handle)
-        wait(model)
         return model
 
     def _predict_using_fil(self, X, delayed, **kwargs):

From 8fdccd4a0284601c312ecbd99ebf2f45dacc829d Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 19 Jun 2020 12:29:16 -0500
Subject: [PATCH 27/32] update fil to accept string and boolean

---
 python/cuml/ensemble/randomforest_common.pyx |  4 ++--
 python/cuml/fil/fil.pyx                      | 22 ++++++++++----------
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 06b6817a1b..f77436c5b2 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -386,7 +386,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
         or algo='auto'
     Returns
     ----------
-    fil_sparse_format as a string
+    fil_sparse_format
     """
     accepted_fil_spars_format = {True, False, 'auto'}
 
@@ -406,7 +406,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format):
             "supported. Please refer to the documentation at "
             "(https://docs.rapids.ai/api/cuml/nightly/api.html"
             "#forest-inferencing) to see the accepted values.")
-    return str(fil_sparse_format)
+    return fil_sparse_format
 
 
 def _obtain_fil_model(treelite_handle, depth,
diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx
index 1901d3e61f..7e7a92b584 100644
--- a/python/cuml/fil/fil.pyx
+++ b/python/cuml/fil/fil.pyx
@@ -506,12 +506,12 @@ class ForestInference(Base):
         threshold : float (default=0.5)
             Threshold is used to for classification. It is applied
             only if output_class == True, else it is ignored.
-        storage_type : string (default='auto')
+        storage_type : string or boolean (default='auto')
             In-memory storage format to be used for the FIL model.
              'auto' - choose the storage type automatically
                       (currently DENSE is always used)
-             'False' - create a dense forest
-             'True' - create a sparse forest;
+             False - create a dense forest
+             True - create a sparse forest;
                       requires algo='NAIVE' or algo='AUTO'
 
         Returns
@@ -523,12 +523,12 @@ class ForestInference(Base):
         if isinstance(model, TreeliteModel):
             # TreeliteModel defined in this file
             return self._impl.load_from_treelite_model(
-                model, output_class, algo, threshold, storage_type)
+                model, output_class, algo, threshold, str(storage_type))
         else:
             # assume it is treelite.Model
             return self._impl.load_from_treelite_model_handle(
                 model.handle.value, output_class, algo, threshold,
-                storage_type)
+                str(storage_type))
 
     @staticmethod
     def load_from_sklearn(skl_model,
@@ -561,12 +561,12 @@ class ForestInference(Base):
         threshold : float (default=0.5)
             Threshold is used to for classification. It is applied
             only if output_class == True, else it is ignored.
-        storage_type : string (default='auto')
+        storage_type : string or boolean (default='auto')
             In-memory storage format to be used for the FIL model.
              'auto' - choose the storage type automatically
                       (currently DENSE is always used)
-             'False' - create a dense forest
-             'True' - create a sparse forest;
+             False - create a dense forest
+             True - create a sparse forest;
                       requires algo='NAIVE' or algo='AUTO'
 
         Returns
@@ -584,7 +584,7 @@ class ForestInference(Base):
         tl_model = tl_skl.import_model(skl_model)
         cuml_fm.load_from_treelite_model(
             tl_model, algo=algo, output_class=output_class,
-            storage_type=storage_type, threshold=threshold)
+            storage_type=str(storage_type), threshold=threshold)
         return cuml_fm
 
     @staticmethod
@@ -632,7 +632,7 @@ class ForestInference(Base):
         cuml_fm.load_from_treelite_model(tl_model,
                                          algo=algo,
                                          output_class=output_class,
-                                         storage_type=storage_type,
+                                         storage_type=str(storage_type),
                                          threshold=threshold)
         return cuml_fm
 
@@ -673,4 +673,4 @@ class ForestInference(Base):
         return self._impl.load_using_treelite_handle(model_handle,
                                                      output_class,
                                                      algo, threshold,
-                                                     storage_type)
+                                                     str(storage_type))

From 49915f126b5fc4ef997f3b71408f6b5c9576f2d8 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Fri, 19 Jun 2020 13:34:58 -0500
Subject: [PATCH 28/32] update fil tests

---
 python/cuml/test/test_fil.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/python/cuml/test/test_fil.py b/python/cuml/test/test_fil.py
index 281ba71742..856d78ac25 100644
--- a/python/cuml/test/test_fil.py
+++ b/python/cuml/test/test_fil.py
@@ -190,16 +190,15 @@ def test_fil_regression(n_rows, n_columns, num_rounds, tmp_path, max_depth):
 @pytest.mark.parametrize('n_columns', [20])
 @pytest.mark.parametrize('n_estimators', [1, 10])
 @pytest.mark.parametrize('max_depth', [2, 10, 20])
-@pytest.mark.parametrize('storage_type', ['False', 'True'])
+@pytest.mark.parametrize('storage_type', [False, True])
 @pytest.mark.parametrize('model_class',
                          [GradientBoostingClassifier, RandomForestClassifier])
 @pytest.mark.xfail(not check_min_treelite_version(),
                    reason="need to install treelite version 0.90")
 def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
                                 storage_type, model_class):
-
     # skip depth 20 for dense tests
-    if max_depth == 20 and storage_type == 'False':
+    if max_depth == 20 and not storage_type:
         return
 
     # settings
@@ -236,7 +235,7 @@ def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
 
     skl_acc = accuracy_score(y_validation, skl_preds > 0.5)
 
-    algo = 'NAIVE' if storage_type == 'True' else 'BATCH_TREE_REORG'
+    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'
 
     fm = ForestInference.load_from_sklearn(skl_model,
                                            algo=algo,
@@ -260,7 +259,7 @@ def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth,
 @pytest.mark.parametrize('n_columns', [20])
 @pytest.mark.parametrize('n_estimators', [1, 10])
 @pytest.mark.parametrize('max_depth', [2, 10, 20])
-@pytest.mark.parametrize('storage_type', ['False', 'True'])
+@pytest.mark.parametrize('storage_type', [False, True])
 @pytest.mark.parametrize('model_class',
                          [GradientBoostingRegressor, RandomForestRegressor])
 @pytest.mark.xfail(not check_min_treelite_version(),
@@ -269,7 +268,7 @@ def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
                             storage_type, model_class):
 
     # skip depth 20 for dense tests
-    if max_depth == 20 and storage_type == 'False':
+    if max_depth == 20 and not storage_type:
         return
 
     # settings
@@ -303,7 +302,7 @@ def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth,
 
     skl_mse = mean_squared_error(y_validation, skl_preds)
 
-    algo = 'NAIVE' if storage_type == 'True' else 'BATCH_TREE_REORG'
+    algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG'
 
     fm = ForestInference.load_from_sklearn(skl_model,
                                            algo=algo,
@@ -355,8 +354,7 @@ def test_output_algos(algo, small_classifier_and_preds):
 
 @pytest.mark.skipif(has_xgboost() is False, reason="need to install xgboost")
 @pytest.mark.parametrize('storage_type',
-                         ['AUTO', 'False', 'True', 'auto', 'dense',
-                          'True'])
+                         [False, True, 'auto'])
 def test_output_storage_type(storage_type, small_classifier_and_preds):
     model_path, X, xgb_preds = small_classifier_and_preds
     fm = ForestInference.load(model_path,

From 4e350d53cdd4dd59ae99c8ce3cadd4f1c06af594 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Sun, 28 Jun 2020 23:26:28 -0500
Subject: [PATCH 29/32] update CHANGELOG.md

---
 CHANGELOG.md | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index eda5bc861e..946cb7ab54 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,11 +26,6 @@
 - PR #2340: Import ARIMA in the root init file and fix the `test_fit_function` test
 - PR #2408: Install meta packages for dependencies
 - PR #2417: Move doc customization scripts to Jenkins
-<<<<<<< HEAD
-- PR #2411 Refactor Mixin classes and use in classifier/regressor estimators
-- PR #2237: Refactor RF cython code
-- PR #2403 Support for input and output type consistency in logistic regression predict_proba
-=======
 - PR #2433: Add libcumlprims_mg to CMake
 - PR #2420: Add and set convert_dtype default to True in estimator fit methods
 - PR #2411: Refactor Mixin classes and use in classifier/regressor estimators
@@ -40,7 +35,7 @@
 - PR #2440: Use Treelite Conda package
 - PR #2403: Support for input and output type consistency in logistic regression predict_proba
 - PR #2468: Add `_n_features_in_` attribute to all single GPU estimators that implement fit
->>>>>>> 119f8b61d7613b50fec63be10633415101c978a5
+- PR #2237: Refactor RF cython code
 
 ## Bug Fixes
 - PR #2369: Update RF code to fix set_params memory leak

From 78e8b34e0dbeaf2d46bf03de8ec1e11ec0478e65 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Tue, 30 Jun 2020 21:38:02 -0500
Subject: [PATCH 30/32] update benchmark algo.py file

---
 python/cuml/benchmark/algorithms.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py
index e2d90f57bd..351d3b23f0 100644
--- a/python/cuml/benchmark/algorithms.py
+++ b/python/cuml/benchmark/algorithms.py
@@ -396,7 +396,7 @@ def all_algorithms():
                 fil_algo="AUTO",
                 output_class=False,
                 threshold=0.5,
-                storage_type="AUTO",
+                storage_type="auto",
             ),
             name="FIL",
             accepts_labels=False,

From ddc5910abc9751924386b41f623fe0b31fc0b681 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Thu, 2 Jul 2020 17:46:59 -0500
Subject: [PATCH 31/32] removed unnecessary cimport from common.pyx

---
 python/cuml/ensemble/randomforest_common.pyx | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 4e765ed9ae..51b77de285 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -33,8 +33,6 @@ from cuml.ensemble.randomforest_shared import treelite_serialize, \
 from cuml.ensemble.randomforest_shared cimport *
 from cuml.common import input_to_cuml_array, rmm_cupy_ary
 
-cimport cython
-
 
 class BaseRandomForestModel(Base):
     variables = ['n_estimators', 'max_depth', 'handle',

From 33a0d898ec00dea9c51a1fa457fefb6c425477e7 Mon Sep 17 00:00:00 2001
From: salonijain27 <salj7856@gmail.com>
Date: Mon, 13 Jul 2020 01:44:30 -0500
Subject: [PATCH 32/32] resolve merge conflicts

---
 python/cuml/dask/ensemble/base.py                   | 7 +++++--
 python/cuml/dask/ensemble/randomforestclassifier.py | 5 ++---
 python/cuml/ensemble/randomforest_common.pyx        | 2 ++
 python/cuml/ensemble/randomforestregressor.pyx      | 1 +
 python/cuml/test/dask/test_random_forest.py         | 2 +-
 5 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py
index 090feabb2f..35eaad828d 100644
--- a/python/cuml/dask/ensemble/base.py
+++ b/python/cuml/dask/ensemble/base.py
@@ -85,6 +85,7 @@ def _estimators_per_worker(self, n_estimators):
 
     def _fit(self, model, dataset, convert_dtype):
         data = DistributedDataHandler.create(dataset, client=self.client)
+        print(" data : ", data)
         self.datatype = data.datatype
         if self.datatype == 'cudf':
             has_float64 = (dataset[0].dtypes.any() == np.float64)
@@ -101,6 +102,7 @@ def _fit(self, model, dataset, convert_dtype):
                 len(dask.array.unique(labels).compute())
         labels = self.client.persist(dataset[1])
         futures = list()
+        print(" data.worker_to_parts.items() : ", data.worker_to_parts.items())
         for idx, (worker, worker_data) in \
                 enumerate(data.worker_to_parts.items()):
             futures.append(
@@ -112,6 +114,7 @@ def _fit(self, model, dataset, convert_dtype):
                     workers=[worker],
                     pure=False)
             )
+            print(" futures : ", futures)
         wait_and_raise_from_futures(futures)
         return self
 
@@ -142,8 +145,8 @@ def _concat_treelite_models(self):
         return model
 
     def _predict_using_fil(self, X, delayed, **kwargs):
-        if self.local_model is None:
-            self.local_model = self._concat_treelite_models()
+        if self._get_internal_model() is None:
+            self._set_internal_model(self._concat_treelite_models())
         data = DistributedDataHandler.create(X, client=self.client)
         self.datatype = data.datatype
         if self.local_model is None:
diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py
index 95ef5883be..e919d33f4f 100755
--- a/python/cuml/dask/ensemble/randomforestclassifier.py
+++ b/python/cuml/dask/ensemble/randomforestclassifier.py
@@ -438,9 +438,8 @@ def predict_proba(self, X,
         y : NumPy
            Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes)
         """
-        if self.local_model is None:
-            self.local_model = self._concat_treelite_models()
-
+        if self._get_internal_model() is None:
+            self._set_internal_model(self._concat_treelite_models())
         data = DistributedDataHandler.create(X, client=self.client)
         self.datatype = data.datatype
         return self._predict_proba(X, delayed, **kwargs)
diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx
index 51b77de285..487fbdfba0 100644
--- a/python/cuml/ensemble/randomforest_common.pyx
+++ b/python/cuml/ensemble/randomforest_common.pyx
@@ -227,6 +227,8 @@ class BaseRandomForestModel(Base):
         X_m, self.n_rows, self.n_cols, self.dtype = \
             input_to_cuml_array(X, check_dtype=[np.float32, np.float64],
                                 order='F')
+        print(" shape of input data, rows : ", self.n_rows)
+        print(" shape of input data, cols : ", self.n_cols)
         if self.n_bins > self.n_rows:
             raise ValueError("The number of bins,`n_bins` can not be greater"
                              " than the number of samples used for training.")
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 18351f8150..fff60902d7 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -442,6 +442,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin):
         # make sure that the `fit` is complete before the following delete
         # call happens
         self.handle.sync()
+        print(" fit model : ", self.print_summary())
         del X_m
         del y_m
         return self
diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py
index a6f8ad927d..0e4a393733 100644
--- a/python/cuml/test/dask/test_random_forest.py
+++ b/python/cuml/test/dask/test_random_forest.py
@@ -293,7 +293,7 @@ def test_rf_concatenation_dask(client, model_type):
     res1 = cu_rf_mg.predict(X_df)
     res1.compute()
     local_tl = TreeliteModel.from_treelite_model_handle(
-        cu_rf_mg.local_model._obtain_treelite_handle(),
+        cu_rf_mg.internal_model._obtain_treelite_handle(),
         take_handle_ownership=False)
 
     assert local_tl.num_trees == n_estimators