From 0b4ccb5230d538889666086cc544f6d0f4cae961 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Mon, 11 May 2020 10:40:45 -0500 Subject: [PATCH 01/32] update cython code --- python/cuml/ensemble/randomforest_common.pyx | 400 ++++++++++++++++++ python/cuml/ensemble/randomforest_shared.pxd | 3 + .../cuml/ensemble/randomforestclassifier.pyx | 373 +++++----------- .../cuml/ensemble/randomforestregressor.pyx | 300 +++---------- 4 files changed, 566 insertions(+), 510 deletions(-) create mode 100644 python/cuml/ensemble/randomforest_common.pyx diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx new file mode 100644 index 0000000000..cb180402c0 --- /dev/null +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -0,0 +1,400 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import cupy as cp +from cuml import ForestInference +from cuml.fil.fil import TreeliteModel as tl +from cuml.common.handle import Handle +from cuml.common.base import Base + +from cuml.ensemble.randomforest_shared cimport * +from cuml.utils import input_to_cuml_array, rmm_cupy_ary + +cimport cython + + +class BaseRandomForestModel(Base): + variables = ['n_estimators', 'max_depth', 'handle', + 'max_features', 'n_bins', + 'split_algo', 'split_criterion', 'min_rows_per_node', + 'min_impurity_decrease', + 'bootstrap', 'bootstrap_features', + 'verbose', 'rows_sample', + 'max_leaves', 'quantile_per_tree'] + + def _create_model(self, model, seed, split_criterion, + n_streams, n_estimators=100, + max_depth=16, handle=None, max_features='auto', + n_bins=8, split_algo=1, bootstrap=True, + bootstrap_features=False, + verbose=False, min_rows_per_node=2, + rows_sample=1.0, max_leaves=-1, + accuracy_metric=None, dtype=None, + output_type=None, min_samples_leaf=None, + min_weight_fraction_leaf=None, n_jobs=None, + max_leaf_nodes=None, min_impurity_decrease=0.0, + min_impurity_split=None, oob_score=None, + random_state=None, warm_start=None, class_weight=None, + quantile_per_tree=False, criterion=None): + + if accuracy_metric: + model.variables.append('accuracy_metric') + sklearn_params = {"criterion": criterion, + "min_samples_leaf": min_samples_leaf, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + "max_leaf_nodes": max_leaf_nodes, + "min_impurity_split": min_impurity_split, + "oob_score": oob_score, "n_jobs": n_jobs, + "random_state": random_state, + "warm_start": warm_start, + "class_weight": class_weight} + + for key, vals in sklearn_params.items(): + if vals is not None: + raise TypeError(" The Scikit-learn variable ", key, + " is not supported in cuML," + " please read the cuML documentation for" + " more information") + + if handle is None: + handle = Handle(n_streams) + + super(model, self).__init__(handle=handle, + verbose=verbose, + output_type=output_type) + if max_depth < 0: + raise ValueError("Must specify max_depth >0 ") + + self.split_algo = split_algo + criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, + '3': MAE, '4': CRITERION_END} + if str(split_criterion) not in criterion_dict.keys(): + warnings.warn("The split criterion chosen was not present" + " in the list of options accepted by the model" + " and so the CRITERION_END option has been chosen.") + self.split_criterion = CRITERION_END + else: + self.split_criterion = criterion_dict[str(split_criterion)] + + self.min_rows_per_node = min_rows_per_node + self.min_impurity_decrease = min_impurity_decrease + self.bootstrap_features = bootstrap_features + self.rows_sample = rows_sample + self.max_leaves = max_leaves + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.bootstrap = bootstrap + self.verbose = verbose + self.n_bins = n_bins + self.n_cols = None + self.dtype = dtype + self.accuracy_metric = accuracy_metric + self.quantile_per_tree = quantile_per_tree + self.n_streams = handle.getNumInternalStreams() + self.seed = seed + self.model_pbuf_bytes = [] + # if self.model_type == curfr: + # print have a check for the random forest meta data in init + """ + def _check_rf_meta_data_format(self, task_category): + if task_category == CLASSIFICATION + """ + def _get_max_feat_val(self): + if type(self.max_features) == int: + return self.max_features/self.n_cols + elif type(self.max_features) == float: + return self.max_features + elif self.max_features == 'sqrt': + return 1/np.sqrt(self.n_cols) + elif self.max_features == 'log2': + return math.log2(self.n_cols)/self.n_cols + elif self.max_features == 'auto': + if self.RF_type == CLASSIFICATION: + return 1/np.sqrt(self.n_cols) + else: + return 1.0 + else: + raise ValueError("Wrong value passed in for max_features" + " please read the documentation") + + def check_rf_metadata_type(self): + cdef RandomForestMetaData[float, int] *rf_forest_class + cdef RandomForestMetaData[double, int] *rf_forest64_class + cdef RandomForestMetaData[float, float] *rf_forest_reg + cdef RandomForestMetaData[double, double] *rf_forest64_reg + if self.RF_type == CLASSIFICATION: + rf_forest_class = \ + new RandomForestMetaData[float, int]() + self.rf_forest = rf_forest_class + rf_forest64_class = \ + new RandomForestMetaData[double, int]() + self.rf_forest64 = rf_forest64_class + else: + rf_forest_reg = \ + new RandomForestMetaData[float, float]() + self.rf_forest = rf_forest_reg + rf_forest64_reg = \ + new RandomForestMetaData[double, double]() + self.rf_forest64 = rf_forest64_reg + + def fit_setup(self, X, y, convert_dtype): + self._set_output_type(X) + + # Reset the old tree data for new fit call + self._reset_forest_data() + + #cdef uintptr_t X_ptr, y_ptr + + X_m, self.n_rows, self.n_cols, self.dtype = \ + input_to_cuml_array(X, check_dtype=[np.float32, np.float64], + order='F') + X_ptr = X_m.ptr + print(" type pf X_ptr in common : ", type(X_ptr)) + if self.RF_type == CLASSIFICATION: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, check_dtype=np.int32, + convert_to_dtype=(np.int32 if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + if y_dtype != np.int32: + raise TypeError("The labels `y` need to be of dtype `np.int32`") + unique_labels = rmm_cupy_ary(cp.unique, y_m) + self.num_classes = len(unique_labels) + for i in range(self.num_classes): + if i not in unique_labels: + raise ValueError("The labels need " + "to be consecutive values from " + "0 to the number of unique label values") + else: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, + convert_to_dtype=(self.dtype if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + y_ptr = y_m.ptr + + if self.dtype == np.float64: + warnings.warn("To use GPU-based prediction, first train using \ + float 32 data to fit the estimator.") + + max_feature_val = self._get_max_feat_val() + if type(self.min_rows_per_node) == float: + self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows) + + """ + cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest + cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest64 + if self.RF_type == CLASSIFICATION: + *rf_forest = \ + new RandomForestMetaData[float, int]() + self.rf_forest = rf_forest + *rf_forest64 = \ + new RandomForestMetaData[double, int]() + self.rf_forest64 = rf_forest64 + else: + *rf_forest = \ + new RandomForestMetaData[float, float]() + self.rf_forest = rf_forest + *rf_forest64 = \ + new RandomForestMetaData[double, double]() + self.rf_forest64 = rf_forest64 + + if self.dtype == np.float32: + fit(handle_[0], + rf_forest, + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + rf_params, + self.verbosity) + else: + rf_params64 = rf_params + fit(handle_[0], + rf_forest64, + X_ptr, + self.n_rows, + self.n_cols, + y_ptr, + rf_params64, + self.verbosity) + """ + return X_m, y_m, max_feature_val + + def _predict_model_on_gpu(self, model, X, algo, convert_dtype, + fil_sparse_format, threshold=0.5, + output_class=False, predict_proba=False): + out_type = self._get_output_type(X) + cdef ModelHandle cuml_model_ptr = NULL + _, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='F', + check_cols=self.n_cols) + + if dtype == np.float64 and not convert_dtype: + raise TypeError("GPU based predict only accepts np.float32 data. \ + Please set convert_dtype=True to convert the test \ + data to the same dtype as the data used to train, \ + ie. np.float32. If you would like to use test \ + data of dtype=np.float64 please set \ + predict_model='CPU' to use the CPU implementation \ + of predict.") + + model._obtain_treelite_handle() + storage_type = \ + _check_fil_parameter_validity(depth=self.max_depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(self.treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_type) + + preds = tl_to_fil_model.predict(X, output_type=out_type, + predict_proba=predict_proba) + tl.free_treelite_model(self.treelite_handle) + return preds + + def _get_params(self, model, deep): + params = dict() + for key in model.variables: + if key in ['handle']: + continue + var_value = getattr(self, key, None) + params[key] = var_value + return params + + def _set_params(self, model, **params): + self.handle.__setstate__(self.n_streams) + self.model_pbuf_bytes = [] + + if not params: + return self + for key, value in params.items(): + if key not in model.variables: + raise ValueError('Invalid parameter for estimator') + else: + setattr(self, key, value) + return self + + """ + def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type): + cdef ModelHandle cuml_model_ptr = NULL + cdef rf_class_float *rf_forest_class + cdef rf_reg_float *rf_forest_reg + if task_category == CLASSIFICATION: + rf_forest_class = \ + self.rf_forest + + else: + rf_forest_reg = \ + self.rf_forest + build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr, + rf_forest_reg, + self.n_cols, + task_category, + self.model_pbuf_bytes) + mod_ptr = cuml_model_ptr + treelite_handle = ctypes.c_void_p(mod_ptr).value + return treelite_handle + + """ + def _get_protobuf_bytes_common(self, model): + fit_mod_ptr = model._obtain_treelite_handle() + cdef uintptr_t model_ptr = fit_mod_ptr + model_protobuf_bytes = save_model( model_ptr) + return model_protobuf_bytes + + +def _check_fil_parameter_validity(depth, algo, fil_sparse_format): + storage_format = _check_fil_sparse_format_value(fil_sparse_format) + if (depth > 16 and (storage_format == 'dense' or + algo == 'tree_reorg' or + algo == 'batch_tree_reorg')): + raise ValueError("While creating a forest with max_depth greater " + "than 16, `fil_sparse_format` should be True. " + "If `fil_sparse_format=False` then the memory" + "consumed while creating the FIL forest is very " + "large and the process will be aborted. In " + "addition, `algo` must be either set to `naive' " + "or `auto` to set 'fil_sparse_format=True`.") + return storage_format + + +def _check_fil_sparse_format_value(fil_sparse_format): + accepted_vals = [True, False, 'auto'] + if fil_sparse_format == 'auto': + storage_format = fil_sparse_format + elif not fil_sparse_format: + storage_format = 'dense' + elif fil_sparse_format not in accepted_vals: + raise ValueError("The value entered for spares_forest is not " + "supported. Please refer to the documentation " + "to see the accepted values.") + else: + storage_format = 'sparse' + + return storage_format + + +def _obtain_treelite_model(treelite_handle): + """ + Creates a Treelite model using the treelite handle + obtained from the cuML Random Forest model. + + Returns + ---------- + tl_to_fil_model : Treelite version of this model + """ + treelite_model = \ + tl.from_treelite_model_handle(treelite_handle) + return treelite_model + + +def _obtain_fil_model(treelite_handle, depth, + output_class=True, + threshold=0.5, algo='auto', + fil_sparse_format='auto'): + """ + Creates a Forest Inference (FIL) model using the treelite + handle obtained from the cuML Random Forest model. + + Returns + ---------- + fil_model : + A Forest Inference model which can be used to perform + inferencing on the random forest model. + """ + + storage_format = \ + _check_fil_parameter_validity(depth=depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_format) + + return tl_to_fil_model diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index 4ea76c92b7..10c0657030 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -37,6 +37,8 @@ from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \ input_to_dev_array, zeros cimport cuml.common.handle cimport cuml.common.cuda +cimport cython + cdef extern from "treelite/c_api.h": ctypedef void* ModelHandle @@ -127,3 +129,4 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": cdef ModelHandle concatenate_trees( vector[ModelHandle] &treelite_handles) except + + diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 0bfebaf950..aea7768da1 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -1,3 +1,4 @@ + # # Copyright (c) 2019-2020, NVIDIA CORPORATION. # @@ -38,6 +39,8 @@ from cuml import ForestInference from cuml.common.array import CumlArray from cuml.common.base import Base from cuml.common.handle import Handle +from cuml.ensemble.randomforest_common import BaseRandomForestModel + from cuml.common.handle cimport cumlHandle from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model @@ -50,6 +53,7 @@ from numba import cuda cimport cuml.common.handle cimport cuml.common.cuda +cimport cython cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": @@ -120,7 +124,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": bool) except + -class RandomForestClassifier(Base): +class RandomForestClassifier(BaseRandomForestModel): """ Implements a Random Forest classifier model which fits multiple decision tree classifiers in an ensemble. @@ -219,88 +223,20 @@ class RandomForestClassifier(Base): seed : int (default = None) Seed for the random number generator. Unseeded by default. """ + def __init__(self, split_criterion=0, seed=None, + n_streams=8, **kwargs): - variables = ['n_estimators', 'max_depth', 'handle', - 'max_features', 'n_bins', - 'split_algo', 'split_criterion', 'min_rows_per_node', - 'min_impurity_decrease', - 'bootstrap', 'bootstrap_features', - 'verbose', 'rows_sample', - 'max_leaves', 'quantile_per_tree'] - - def __init__(self, n_estimators=100, max_depth=16, handle=None, - max_features='auto', n_bins=8, n_streams=8, - split_algo=1, split_criterion=0, min_rows_per_node=2, - bootstrap=True, bootstrap_features=False, - type_model="classifier", verbose=False, - rows_sample=1.0, max_leaves=-1, quantile_per_tree=False, - output_type=None, criterion=None, dtype=None, - min_samples_leaf=None, min_weight_fraction_leaf=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None, - seed=None): - sklearn_params = {"criterion": criterion, - "min_samples_leaf": min_samples_leaf, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "max_leaf_nodes": max_leaf_nodes, - "min_impurity_split": min_impurity_split, - "oob_score": oob_score, "n_jobs": n_jobs, - "random_state": random_state, - "warm_start": warm_start, - "class_weight": class_weight} - - for key, vals in sklearn_params.items(): - if vals is not None: - raise TypeError("The Scikit-learn variable", key, - " is not supported in cuML," - " please read the cuML documentation for" - " more information") - - if max_depth < 0: - raise ValueError("Must specify max_depth >0") - - if handle is None: - handle = Handle(n_streams) - - super(RandomForestClassifier, self).__init__(handle=handle, - verbose=verbose, - output_type=output_type) - - self.split_algo = split_algo - criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, - '3': MAE, '4': CRITERION_END} - if str(split_criterion) not in criterion_dict.keys(): - warnings.warn("The split criterion chosen was not present" - " in the list of options accepted by the model" - " and so the CRITERION_END option has been chosen.") - self.split_criterion = CRITERION_END - else: - self.split_criterion = criterion_dict[str(split_criterion)] - - self.min_rows_per_node = min_rows_per_node - self.min_impurity_decrease = min_impurity_decrease - self.bootstrap_features = bootstrap_features - self.rows_sample = rows_sample - self.max_leaves = max_leaves - self.n_estimators = n_estimators - self.max_depth = max_depth - self.max_features = max_features - self.bootstrap = bootstrap - self.verbose = verbose - self.n_bins = n_bins - self.quantile_per_tree = quantile_per_tree - self.n_cols = None - self.dtype = None - self.n_streams = handle.getNumInternalStreams() - self.seed = seed - self.num_classes = 2 if ((seed is not None) and (n_streams != 1)): warnings.warn("For reproducible results, n_streams==1 is " "recommended. If n_streams is > 1, results may vary " "due to stream/thread timing differences, even when " "random_seed is set") - self.model_pbuf_bytes = [] + self.RF_type = CLASSIFICATION + self.num_classes = 2 + self._create_model(model=RandomForestClassifier, + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + **kwargs) """ TODO: @@ -319,10 +255,10 @@ class RandomForestClassifier(Base): if self.n_cols: # only if model has been fit previously self.model_pbuf_bytes = self._get_protobuf_bytes() - params_t = self.rf_forest + params_t = self.rf_forest rf_forest = \ params_t - params_t64 = self.rf_forest64 + params_t64 = self.rf_forest64 rf_forest64 = \ params_t64 if self.dtype == np.float32: @@ -347,10 +283,10 @@ class RandomForestClassifier(Base): if self.n_cols: if state["dtype"] == np.float32: rf_forest.rf_params = state["rf_params"] - state["rf_forest"] = rf_forest + state["rf_forest"] = rf_forest else: rf_forest64.rf_params = state["rf_params64"] - state["rf_forest64"] = rf_forest64 + state["rf_forest64"] = rf_forest64 self.model_pbuf_bytes = state["model_pbuf_bytes"] self.__dict__.update(state) @@ -358,60 +294,74 @@ class RandomForestClassifier(Base): def __del__(self): if self.n_cols: if self.dtype == np.float32: - free( + free( self.rf_forest) else: - free( + free( self.rf_forest64) def _reset_forest_data(self): # Only if model is fitted before # Clears the data of the forest to prepare for next fit if self.n_cols: - free( + free( self.rf_forest) - free( + free( self.rf_forest64) - def _get_max_feat_val(self): - if type(self.max_features) == int: - return self.max_features/self.n_cols - elif type(self.max_features) == float: - return self.max_features - elif self.max_features == 'sqrt' or self.max_features == 'auto': - return 1/np.sqrt(self.n_cols) - elif self.max_features == 'log2': - return math.log2(self.n_cols)/self.n_cols - else: - raise ValueError("Wrong value passed in for max_features" - " please read the documentation") - def _obtain_treelite_handle(self): - task_category = CLASSIFICATION_MODEL + cdef ModelHandle cuml_model_ptr = NULL + cdef RandomForestMetaData[float, int] *rf_forest = \ + self.rf_forest if self.num_classes > 2: raise NotImplementedError("Pickling for multi-class " "classification models is currently not " "implemented. Please check cuml issue " "#1679 for more information.") - - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - build_treelite_forest(& cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - return treelite_handle + cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes + cdef vector[unsigned char] model_pbuf_vec + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + if self.treelite_handle is None: + build_treelite_forest( + & cuml_model_ptr, + rf_forest, + self.n_cols, + self.num_classes, + model_pbuf_vec) + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + return self.treelite_handle def _get_protobuf_bytes(self): - fit_mod_ptr = self._obtain_treelite_handle() + """ + Returns the self.model_pbuf_bytes. + Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite + models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model + to a treelite protobuf model which is stored in a temporary file. + The protobuf model information is read from the temporary file and + the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite + protobuf model bytes are stored in `self.model_pbuf_bytes`. If either + of information is already present in the model then the respective + step is skipped. + """ + if self.model_pbuf_bytes: + return self.model_pbuf_bytes + elif self.treelite_handle: + fit_mod_ptr = self.treelite_handle + else: + fit_mod_ptr = self._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr - model_protobuf_bytes = save_model( model_ptr) - - return model_protobuf_bytes + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self.model_pbuf_bytes def convert_to_treelite_model(self): """ @@ -421,8 +371,8 @@ class RandomForestClassifier(Base): ---------- tl_to_fil_model : Treelite version of this model """ - treelite_handle = self._obtain_treelite_handle() - return _obtain_treelite_model(treelite_handle) + handle = self._obtain_treelite_handle() + return _obtain_treelite_model(handle) def convert_to_fil_model(self, output_class=True, threshold=0.5, algo='auto', @@ -470,7 +420,6 @@ class RandomForestClassifier(Base): A Forest Inference model which can be used to perform inferencing on the random forest model. """ - treelite_handle = self._obtain_treelite_handle() return _obtain_fil_model(treelite_handle=treelite_handle, depth=self.max_depth, @@ -479,8 +428,6 @@ class RandomForestClassifier(Base): algo=algo, fil_sparse_format=fil_sparse_format) - return tl_to_fil_model - """ TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. @@ -488,14 +435,14 @@ class RandomForestClassifier(Base): def _tl_model_handles(self, model_bytes): cdef ModelHandle cuml_model_ptr = NULL cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + self.rf_forest task_category = CLASSIFICATION_MODEL build_treelite_forest(& cuml_model_ptr, rf_forest, self.n_cols, task_category, model_bytes) - mod_handle = cuml_model_ptr + mod_handle = cuml_model_ptr return ctypes.c_void_p(mod_handle).value @@ -510,14 +457,14 @@ class RandomForestClassifier(Base): mod_ptr)) concat_model_handle = concatenate_trees(deref(model_handles)) - - concat_model_ptr = concat_model_handle - return ctypes.c_void_p(concat_model_ptr).value - - def _concatenate_model_bytes(self, concat_model_handle): - cdef uintptr_t model_ptr = concat_model_handle - concat_model_bytes = save_model( model_ptr) - self._model_pbuf_bytes = concat_model_bytes + cdef uintptr_t concat_model_ptr = concat_model_handle + self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( concat_model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self def fit(self, X, y, convert_dtype=False): """ @@ -540,53 +487,19 @@ class RandomForestClassifier(Base): memory used for the method. """ - self._set_output_type(X) - - # Reset the old tree data for new fit call - self._reset_forest_data() - cdef uintptr_t X_ptr, y_ptr - - X_m, n_rows, self.n_cols, self.dtype = \ - input_to_cuml_array(X, check_dtype=[np.float32, np.float64], - order='F') + X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype) X_ptr = X_m.ptr - - y_m, _, _, y_dtype = \ - input_to_cuml_array(y, check_dtype=np.int32, - convert_to_dtype=(np.int32 if convert_dtype - else None), - check_rows=n_rows, check_cols=1) y_ptr = y_m.ptr - if y_dtype != np.int32: - raise TypeError("The labels `y` need to be of dtype `np.int32`") - - if self.dtype == np.float64: - warnings.warn("To use GPU-based prediction, first train \ - using float 32 data to fit the estimator.") - cdef cumlHandle* handle_ =\ self.handle.getHandle() - unique_labels = rmm_cupy_ary(cp.unique, y_m) - num_unique_labels = len(unique_labels) - - for i in range(num_unique_labels): - if i not in unique_labels: - raise ValueError("The labels need " - "to be consecutive values from " - "0 to the number of unique label values") - - max_feature_val = self._get_max_feat_val() - if type(self.min_rows_per_node) == float: - self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows) - cdef RandomForestMetaData[float, int] *rf_forest = \ new RandomForestMetaData[float, int]() - self.rf_forest = rf_forest + self.rf_forest = rf_forest cdef RandomForestMetaData[double, int] *rf_forest64 = \ new RandomForestMetaData[double, int]() - self.rf_forest64 = rf_forest64 + self.rf_forest64 = rf_forest64 if self.seed is None: seed_val = NULL @@ -608,14 +521,15 @@ class RandomForestClassifier(Base): self.split_criterion, self.quantile_per_tree, self.n_streams) + if self.dtype == np.float32: fit(handle_[0], rf_forest, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, - num_unique_labels, + self.num_classes, rf_params, self.verbosity) @@ -624,10 +538,10 @@ class RandomForestClassifier(Base): fit(handle_[0], rf_forest64, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, - num_unique_labels, + self.num_classes, rf_params64, self.verbosity) @@ -638,59 +552,10 @@ class RandomForestClassifier(Base): # make sure that the `fit` is complete before the following delete # call happens self.handle.sync() - del(X_m) - del(y_m) - self.num_classes = num_unique_labels + del X_m + del y_m return self - def _predict_model_on_gpu(self, X, output_class, - threshold, algo, - num_classes, convert_dtype, - fil_sparse_format, predict_proba): - out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL - _, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='F', - check_cols=self.n_cols) - - if dtype == np.float64 and not convert_dtype: - raise TypeError("GPU based predict only accepts np.float32 data. \ - Please set convert_dtype=True to convert the test \ - data to the same dtype as the data used to train, \ - ie. np.float32. If you would like to use test \ - data of dtype=np.float64 please set \ - predict_model='CPU' to use the CPU implementation \ - of predict.") - - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - - build_treelite_forest(& cuml_model_ptr, - rf_forest, - n_cols, - num_classes, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - - storage_type = \ - _check_fil_parameter_validity(depth=self.max_depth, - fil_sparse_format=fil_sparse_format, - algo=algo) - - fil_model = ForestInference() - tl_to_fil_model = \ - fil_model.load_from_randomforest(treelite_handle, - output_class=output_class, - threshold=threshold, - algo=algo, - storage_type=storage_type) - - preds = tl_to_fil_model.predict(X, output_type=out_type, - predict_proba=predict_proba) - tl.free_treelite_model(treelite_handle) - return preds - def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr @@ -705,13 +570,13 @@ class RandomForestClassifier(Base): cdef uintptr_t preds_ptr = preds.ptr cdef cumlHandle* handle_ =\ - self.handle.getHandle() + self.handle.getHandle() cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + self.rf_forest cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 + self.rf_forest64 if self.dtype == np.float32: predict(handle_[0], rf_forest, @@ -817,10 +682,10 @@ class RandomForestClassifier(Base): else: preds = \ - self._predict_model_on_gpu(X, output_class=output_class, + self._predict_model_on_gpu(model=RandomForestClassifier, + X=X, output_class=output_class, threshold=threshold, algo=algo, - num_classes=num_classes, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format, predict_proba=False) @@ -856,12 +721,12 @@ class RandomForestClassifier(Base): preds_ptr = preds.ptr cdef cumlHandle* handle_ =\ - self.handle.getHandle() + self.handle.getHandle() cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + self.rf_forest cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 + self.rf_forest64 if self.dtype == np.float32: predictGetAll(handle_[0], rf_forest, @@ -1050,13 +915,13 @@ class RandomForestClassifier(Base): preds_ptr = preds_m.ptr cdef cumlHandle* handle_ =\ - self.handle.getHandle() + self.handle.getHandle() cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + self.rf_forest cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 + self.rf_forest64 if self.dtype == np.float32: self.stats = score(handle_[0], @@ -1091,13 +956,10 @@ class RandomForestClassifier(Base): ----------- deep : boolean (default = True) """ - params = dict() - for key in RandomForestClassifier.variables: - if key in ['handle']: - continue - var_value = getattr(self, key, None) - params[key] = var_value - return params + + + return self._get_params(model=RandomForestClassifier, + deep=deep) def set_params(self, **params): """ @@ -1110,27 +972,20 @@ class RandomForestClassifier(Base): params : dict of new params """ # Resetting handle as __setstate__ overwrites with handle=None - self.handle.__setstate__(self.n_streams) - self.model_pbuf_bytes = [] - - if not params: - return self - for key, value in params.items(): - if key not in RandomForestClassifier.variables: - raise ValueError('Invalid parameter for estimator') - else: - setattr(self, key, value) - return self + + + return self._set_params(model=RandomForestClassifier, + **params) def print_summary(self): """ Prints the summary of the forest used to train and test the model """ - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + cdef RandomForestMetaData[float, float] *rf_forest = \ + self.rf_forest - cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 + cdef RandomForestMetaData[double, double] *rf_forest64 = \ + self.rf_forest64 if self.dtype == np.float64: print_rf_summary(rf_forest64) @@ -1142,11 +997,11 @@ class RandomForestClassifier(Base): Prints the detailed information about the forest used to train and test the Random Forest model """ - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest + cdef RandomForestMetaData[float, float] *rf_forest = \ + self.rf_forest - cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 + cdef RandomForestMetaData[double, double] *rf_forest64 = \ + self.rf_forest64 if self.dtype == np.float64: print_rf_detailed(rf_forest64) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 0feb5dabf0..2d64f0718a 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -34,9 +34,10 @@ from cuml import ForestInference from cuml.common.array import CumlArray from cuml.common.base import Base from cuml.common.handle import Handle +from cuml.ensemble.randomforest_common import BaseRandomForestModel from cuml.common.handle cimport cumlHandle from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ - _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model + _obtain_treelite_model, _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel as tl @@ -51,7 +52,7 @@ cimport cuml.common.cuda cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": - + cdef void fit(cumlHandle & handle, RandomForestMetaData[float, float]*, float*, @@ -69,7 +70,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": double*, RF_params, int) except + - + cdef void predict(cumlHandle& handle, RandomForestMetaData[float, float] *, float*, @@ -101,7 +102,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": int) except + -class RandomForestRegressor(Base): +class RandomForestRegressor(BaseRandomForestModel): """ Implements a Random Forest regressor model which fits multiple decision @@ -207,87 +208,19 @@ class RandomForestRegressor(Base): currently fully guarantee the exact same results. """ - variables = ['n_estimators', 'max_depth', 'handle', - 'max_features', 'n_bins', - 'split_algo', 'split_criterion', 'min_rows_per_node', - 'min_impurity_decrease', - 'bootstrap', 'bootstrap_features', - 'verbose', 'rows_sample', - 'max_leaves', 'quantile_per_tree', - 'accuracy_metric'] - - def __init__(self, n_estimators=100, max_depth=16, handle=None, - max_features='auto', n_bins=8, n_streams=8, - split_algo=1, split_criterion=2, - bootstrap=True, bootstrap_features=False, - verbose=False, min_rows_per_node=2, - rows_sample=1.0, max_leaves=-1, - accuracy_metric='mse', output_type=None, - min_samples_leaf=None, dtype=None, - min_weight_fraction_leaf=None, n_jobs=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, - random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None, seed=None): - sklearn_params = {"criterion": criterion, - "min_samples_leaf": min_samples_leaf, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "max_leaf_nodes": max_leaf_nodes, - "min_impurity_split": min_impurity_split, - "oob_score": oob_score, "n_jobs": n_jobs, - "random_state": random_state, - "warm_start": warm_start, - "class_weight": class_weight} - - for key, vals in sklearn_params.items(): - if vals is not None: - raise TypeError(" The Scikit-learn variable ", key, - " is not supported in cuML," - " please read the cuML documentation for" - " more information") - - if handle is None: - handle = Handle(n_streams) - - super(RandomForestRegressor, self).__init__(handle=handle, - verbose=verbose, - output_type=output_type) - - if max_depth < 0: - raise ValueError("Must specify max_depth >0 ") - - self.split_algo = split_algo - criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, - '3': MAE, '4': CRITERION_END} - if str(split_criterion) not in criterion_dict.keys(): - warnings.warn("The split criterion chosen was not present" - " in the list of options accepted by the model" - " and so the CRITERION_END option has been chosen.") - self.split_criterion = CRITERION_END - else: - self.split_criterion = criterion_dict[str(split_criterion)] - - self.min_rows_per_node = min_rows_per_node - self.min_impurity_decrease = min_impurity_decrease - self.bootstrap_features = bootstrap_features - self.rows_sample = rows_sample - self.max_leaves = max_leaves - self.n_estimators = n_estimators - self.max_depth = max_depth - self.max_features = max_features - self.bootstrap = bootstrap - self.verbose = verbose - self.n_bins = n_bins - self.n_cols = None - self.dtype = None - self.accuracy_metric = accuracy_metric - self.quantile_per_tree = quantile_per_tree - self.n_streams = handle.getNumInternalStreams() - self.seed = seed + def __init__(self, split_criterion=2, seed=None, + accuracy_metric='mse', n_streams=8, + **kwargs): + if ((seed is not None) and (n_streams != 1)): warnings.warn("Setting the random seed does not fully guarantee" " the exact same results at this time.") - self.model_pbuf_bytes = [] + self.RF_type = REGRESSION + self._create_model(model=RandomForestRegressor, + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + accuracy_metric=accuracy_metric, + **kwargs) """ TODO: @@ -358,34 +291,26 @@ class RandomForestRegressor(Base): free( self.rf_forest64) - def _get_max_feat_val(self): - if type(self.max_features) == int: - return self.max_features/self.n_cols - elif type(self.max_features) == float: - return self.max_features - elif self.max_features == 'sqrt': - return 1/np.sqrt(self.n_cols) - elif self.max_features == 'auto': - return 1.0 - elif self.max_features == 'log2': - return math.log2(self.n_cols)/self.n_cols - else: - raise ValueError("Wrong value passed in for max_features" - " please read the documentation") - def _obtain_treelite_handle(self): - task_category = REGRESSION_MODEL cdef ModelHandle cuml_model_ptr = NULL cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - build_treelite_forest(& cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - return treelite_handle + self.rf_forest + cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes + cdef vector[unsigned char] model_pbuf_vec + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + if self.treelite_handle is None: + task_category = REGRESSION_MODEL + build_treelite_forest( + & cuml_model_ptr, + rf_forest, + self.n_cols, + task_category, + model_pbuf_vec) + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + return self.treelite_handle def _get_protobuf_bytes(self): fit_mod_ptr = self._obtain_treelite_handle() @@ -404,60 +329,12 @@ class RandomForestRegressor(Base): treelite_handle = self._obtain_treelite_handle() return _obtain_treelite_model(treelite_handle) - def convert_to_fil_model(self, output_class=False, - algo='auto', - fil_sparse_format='auto'): - """ - Create a Forest Inference (FIL) model from the trained cuML - Random Forest model. - - Parameters - ---------- - output_class : boolean (default = True) - This is optional and required only while performing the - predict operation on the GPU. - If true, return a 1 or 0 depending on whether the raw - prediction exceeds the threshold. If False, just return - the raw prediction. - algo : string (default = 'auto') - This is optional and required only while performing the - predict operation on the GPU. - 'naive' - simple inference using shared memory - 'tree_reorg' - similar to naive but trees rearranged to be more - coalescing-friendly - 'batch_tree_reorg' - similar to tree_reorg but predicting - multiple rows per thread block - `auto` - choose the algorithm automatically. Currently - 'batch_tree_reorg' is used for dense storage - and 'naive' for sparse storage - fil_sparse_format : boolean or string (default = auto) - This variable is used to choose the type of forest that will be - created in the Forest Inference Library. It is not required - while using predict_model='CPU'. - 'auto' - choose the storage type automatically - (currently True is chosen by auto) - False - create a dense forest - True - create a sparse forest, requires algo='naive' - or algo='auto' - - Returns - ---------- - fil_model : - A Forest Inference model which can be used to perform - inferencing on the random forest model. - - """ - treelite_handle = self._obtain_treelite_handle() - return _obtain_fil_model(treelite_handle=treelite_handle, - depth=self.max_depth, - output_class=output_class, - algo=algo, - fil_sparse_format=fil_sparse_format) """ TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ + """ def _tl_model_handles(self, model_bytes): task_category = REGRESSION_MODEL cdef ModelHandle tl_model_ptr = NULL @@ -468,7 +345,7 @@ class RandomForestRegressor(Base): self.n_cols, task_category, model_bytes) - mod_handle = tl_model_ptr + mod_handle = tl_model_ptr return ctypes.c_void_p(mod_handle).value @@ -491,6 +368,7 @@ class RandomForestRegressor(Base): cdef uintptr_t model_ptr = concat_model_handle concat_model_bytes = save_model( model_ptr) self.model_pbuf_bytes = concat_model_bytes + """ def fit(self, X, y, convert_dtype=False): """ @@ -508,35 +386,13 @@ class RandomForestRegressor(Base): ndarray, cuda array interface compliant array like CuPy These labels should be contiguous integers from 0 to n_classes. """ - self._set_output_type(X) - - # Reset the old tree data for new fit call - self._reset_forest_data() - cdef uintptr_t X_ptr, y_ptr - - X_m, n_rows, self.n_cols, self.dtype = \ - input_to_cuml_array(X, check_dtype=[np.float32, np.float64], - order='F') + X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype) X_ptr = X_m.ptr - y_m, _, _, y_dtype = \ - input_to_cuml_array(y, - convert_to_dtype=(self.dtype if convert_dtype - else None), - check_rows=n_rows, check_cols=1) y_ptr = y_m.ptr - - if self.dtype == np.float64: - warnings.warn("To use GPU-based prediction, first train using \ - float 32 data to fit the estimator.") - cdef cumlHandle* handle_ =\ self.handle.getHandle() - max_feature_val = self._get_max_feat_val() - if type(self.min_rows_per_node) == float: - self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows) - cdef RandomForestMetaData[float, float] *rf_forest = \ new RandomForestMetaData[float, float]() self.rf_forest = rf_forest @@ -569,7 +425,7 @@ class RandomForestRegressor(Base): fit(handle_[0], rf_forest, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, rf_params, @@ -580,7 +436,7 @@ class RandomForestRegressor(Base): fit(handle_[0], rf_forest64, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, rf_params64, @@ -588,55 +444,10 @@ class RandomForestRegressor(Base): # make sure that the `fit` is complete before the following delete # call happens self.handle.sync() - del(X_m) - del(y_m) + del X_m + del y_m return self - def _predict_model_on_gpu(self, X, algo, convert_dtype, - fil_sparse_format): - out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL - _, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='F', - check_cols=self.n_cols) - - if dtype == np.float64 and not convert_dtype: - raise TypeError("GPU based predict only accepts np.float32 data. \ - Please set convert_dtype=True to convert the test \ - data to the same dtype as the data used to train, \ - ie. np.float32. If you would like to use test \ - data of dtype=np.float64 please set \ - predict_model='CPU' to use the CPU implementation \ - of predict.") - - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - - task_category = REGRESSION_MODEL - build_treelite_forest(& cuml_model_ptr, - rf_forest, - n_cols, - task_category, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - - storage_type = \ - _check_fil_parameter_validity(depth=self.max_depth, - fil_sparse_format=fil_sparse_format, - algo=algo) - - fil_model = ForestInference() - tl_to_fil_model = \ - fil_model.load_from_randomforest(treelite_handle, - output_class=False, - algo=algo, - storage_type=storage_type) - - preds = tl_to_fil_model.predict(X, out_type) - tl.free_treelite_model(treelite_handle) - return preds - def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr @@ -744,8 +555,11 @@ class RandomForestRegressor(Base): setting predict_model = 'CPU'") else: - preds = self._predict_model_on_gpu(X, algo, convert_dtype, - fil_sparse_format) + preds = self._predict_model_on_gpu(model=RandomForestRegressor, + X=X, + algo=algo, + convert_dtype=convert_dtype, + fil_sparse_format=fil_sparse_format) return preds @@ -861,14 +675,8 @@ class RandomForestRegressor(Base): ----------- deep : boolean (default = True) """ - - params = dict() - for key in RandomForestRegressor.variables: - if key in ['handle']: - continue - var_value = getattr(self, key, None) - params[key] = var_value - return params + return self._get_params(model=RandomForestRegressor, + deep=deep) def set_params(self, **params): """ @@ -881,18 +689,8 @@ class RandomForestRegressor(Base): params : dict of new params """ # Resetting handle as __setstate__ overwrites with handle=None - self.handle.__setstate__(self.n_streams) - self.model_pbuf_bytes = [] - - if not params: - return self - for key, value in params.items(): - if key not in RandomForestRegressor.variables: - raise ValueError('Invalid parameter for estimator') - else: - setattr(self, key, value) - - return self + return self._set_params(model=RandomForestRegressor, + **params) def print_summary(self): """ From 9a9800adf5d12157b3b1cce0f6fddd1532b8439a Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Tue, 12 May 2020 20:15:46 -0500 Subject: [PATCH 02/32] update cython code --- python/cuml/ensemble/randomforest_common.pyx | 135 +++++++----------- python/cuml/ensemble/randomforest_shared.pxd | 37 ++++- .../cuml/ensemble/randomforestclassifier.pyx | 31 +--- .../cuml/ensemble/randomforestregressor.pyx | 30 +--- 4 files changed, 90 insertions(+), 143 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 4ff32cae5a..54f9f48414 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -16,16 +16,25 @@ import ctypes import cupy as cp +import math +import warnings + +import numpy as np from cuml import ForestInference from cuml.fil.fil import TreeliteModel as tl from cuml.common.handle import Handle from cuml.common.base import Base from cuml.ensemble.randomforest_shared cimport * -from cuml.utils import input_to_cuml_array, rmm_cupy_ary +from cuml.common import input_to_cuml_array, rmm_cupy_ary cimport cython +#RandomForestMetaData[X_dtype, y_dtype] *meta +#cdef creat_meta(X_dtype a, y_dtype b): +# meta = new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]() + #return meta + class BaseRandomForestModel(Base): variables = ['n_estimators', 'max_depth', 'handle', @@ -108,6 +117,7 @@ class BaseRandomForestModel(Base): self.n_streams = handle.getNumInternalStreams() self.seed = seed self.model_pbuf_bytes = bytearray() + self.treelite_handle = None # if self.model_type == curfr: # print have a check for the random forest meta data in init """ @@ -132,39 +142,53 @@ class BaseRandomForestModel(Base): raise ValueError("Wrong value passed in for max_features" " please read the documentation") - def check_rf_metadata_type(self): - cdef RandomForestMetaData[float, int] *rf_forest_class - cdef RandomForestMetaData[double, int] *rf_forest64_class - cdef RandomForestMetaData[float, float] *rf_forest_reg - cdef RandomForestMetaData[double, double] *rf_forest64_reg + def _obtain_treelite_handle(self): + cdef ModelHandle cuml_model_ptr = NULL + cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes + cdef vector[unsigned char] model_pbuf_vec + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + print(self.RF_type) + cdef cython.float a + cdef cython.type b if self.RF_type == CLASSIFICATION: - rf_forest_class = \ - new RandomForestMetaData[float, int]() - self.rf_forest = rf_forest_class - rf_forest64_class = \ - new RandomForestMetaData[double, int]() - self.rf_forest64 = rf_forest64_class + meta = create_meta(a, b) else: - rf_forest_reg = \ - new RandomForestMetaData[float, float]() - self.rf_forest = rf_forest_reg - rf_forest64_reg = \ - new RandomForestMetaData[double, double]() - self.rf_forest64 = rf_forest64_reg - - def fit_setup(self, X, y, convert_dtype): + meta = create_meta(a, a) + # self.rf_forest + #cdef object (*meta_info)(float, int) + #meta_info = create_meta + + #cdef RandomForestMetaData[cython.typeof(a), cython.typeof(b)] *meta_info = get_meta_data[cython.typeof(a), cython.typeof(b)]( self.rf_forest) + # self.forest #get_meta_data[float, int]( self.rf_forest) + #cdef RandomForestMetaData[float, int] *rf_forest + #cdef fused_rf_meta *forest = \ + # self.rf_forest + if self.treelite_handle is None: + build_treelite_forest( + & cuml_model_ptr, + meta, + self.n_cols, + self.num_classes, + model_pbuf_vec) + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + + return self.treelite_handle + + def _dataset_setup(self, X, y, convert_dtype): self._set_output_type(X) # Reset the old tree data for new fit call self._reset_forest_data() - #cdef uintptr_t X_ptr, y_ptr - X_m, self.n_rows, self.n_cols, self.dtype = \ input_to_cuml_array(X, check_dtype=[np.float32, np.float64], order='F') - X_ptr = X_m.ptr - print(" type pf X_ptr in common : ", type(X_ptr)) if self.RF_type == CLASSIFICATION: y_m, _, _, y_dtype = \ input_to_cuml_array(y, check_dtype=np.int32, @@ -186,7 +210,6 @@ class BaseRandomForestModel(Base): convert_to_dtype=(self.dtype if convert_dtype else None), check_rows=self.n_rows, check_cols=1) - y_ptr = y_m.ptr if self.dtype == np.float64: warnings.warn("To use GPU-based prediction, first train using \ @@ -196,44 +219,6 @@ class BaseRandomForestModel(Base): if type(self.min_rows_per_node) == float: self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows) - """ - cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest - cdef RandomForestMetaData[cython.floating, cython.numeric] *rf_forest64 - if self.RF_type == CLASSIFICATION: - *rf_forest = \ - new RandomForestMetaData[float, int]() - self.rf_forest = rf_forest - *rf_forest64 = \ - new RandomForestMetaData[double, int]() - self.rf_forest64 = rf_forest64 - else: - *rf_forest = \ - new RandomForestMetaData[float, float]() - self.rf_forest = rf_forest - *rf_forest64 = \ - new RandomForestMetaData[double, double]() - self.rf_forest64 = rf_forest64 - - if self.dtype == np.float32: - fit(handle_[0], - rf_forest, - X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - rf_params, - self.verbosity) - else: - rf_params64 = rf_params - fit(handle_[0], - rf_forest64, - X_ptr, - self.n_rows, - self.n_cols, - y_ptr, - rf_params64, - self.verbosity) - """ return X_m, y_m, max_feature_val def _predict_model_on_gpu(self, model, X, algo, convert_dtype, @@ -254,7 +239,7 @@ class BaseRandomForestModel(Base): predict_model='CPU' to use the CPU implementation \ of predict.") - model._obtain_treelite_handle() + self._obtain_treelite_handle() storage_type = \ _check_fil_parameter_validity(depth=self.max_depth, fil_sparse_format=fil_sparse_format, @@ -295,28 +280,6 @@ class BaseRandomForestModel(Base): setattr(self, key, value) return self - """ - def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type): - cdef ModelHandle cuml_model_ptr = NULL - cdef rf_class_float *rf_forest_class - cdef rf_reg_float *rf_forest_reg - if task_category == CLASSIFICATION: - rf_forest_class = \ - self.rf_forest - - else: - rf_forest_reg = \ - self.rf_forest - build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr, - rf_forest_reg, - self.n_cols, - task_category, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - return treelite_handle - - """ def _get_protobuf_bytes_common(self, model): fit_mod_ptr = model._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index 976c85e7d0..e7074e91e9 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -33,7 +33,7 @@ from cuml.common.handle import Handle from cuml import ForestInference from cuml.common.base import Base from cuml.common.handle cimport cumlHandle -from cuml.common import get_cudf_column_ptr, get_dev_array_ptr, \ +from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \ input_to_dev_array, zeros cimport cuml.common.handle cimport cuml.common.cuda @@ -47,7 +47,7 @@ cdef extern from "treelite/c_api.h": ModelHandle model) cdef const char* TreeliteGetLastError() -cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": +cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: cdef enum CRITERION: GINI, ENTROPY, @@ -55,7 +55,15 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": MAE, CRITERION_END -cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree": +cdef extern from "cuml/tree/flatnode.h" namespace "ML::Flatnode" nogil: + cdef cppclass SparseTreeNode[T, L]: + L prediction + int colid + T quesval + T best_metric_val + int left_child_id + +cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil: cdef struct DecisionTreeParams: int max_depth int max_leaves @@ -67,7 +75,15 @@ cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree": bool quantile_per_tree CRITERION split_criterion -cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": + cdef cppclass TreeMetaDataNode[T, L]: + int treeid + int depth_counter + int leaf_counter + double prepare_time + double train_time + vector[SparseTreeNode[T, L]] sparsetree + +cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: cdef enum RF_type: CLASSIFICATION, @@ -92,18 +108,29 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": pass cdef cppclass RandomForestMetaData[T, L]: - void* trees + ctypedef TreeMetaDataNode[T, L]* trees RF_params rf_params + + ctypedef fused fused_rf_meta: + RandomForestMetaData[float, float] + RandomForestMetaData[double, double] + RandomForestMetaData[float, int] + RandomForestMetaData[double, int] + + cdef fused_rf_meta *meta + # # Treelite handling # + cdef void build_treelite_forest[T, L](ModelHandle*, RandomForestMetaData[T, L]*, int, int, vector[unsigned char] &) except + + cdef vector[unsigned char] save_model_protobuf(ModelHandle) except + cdef void print_rf_summary[T, L](RandomForestMetaData[T, L]*) except + diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index b5b56f322f..3dbb0cedf5 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -309,31 +309,6 @@ class RandomForestClassifier(BaseRandomForestModel): free( self.rf_forest64) - def _obtain_treelite_handle(self): - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - if self.num_classes > 2: - raise NotImplementedError("Pickling for multi-class " - "classification models is currently not " - "implemented. Please check cuml issue " - "#1679 for more information.") - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - if self.treelite_handle is None: - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - self.num_classes, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - return self.treelite_handle - def _get_protobuf_bytes(self): """ Returns the self.model_pbuf_bytes. @@ -488,7 +463,7 @@ class RandomForestClassifier(BaseRandomForestModel): """ cdef uintptr_t X_ptr, y_ptr - X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype) + X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) X_ptr = X_m.ptr y_ptr = y_m.ptr cdef cumlHandle* handle_ =\ @@ -828,10 +803,10 @@ class RandomForestClassifier(BaseRandomForestModel): "implemented. Please check cuml issue " "#1679 for more information.") preds_proba = \ - self._predict_model_on_gpu(X, output_class=output_class, + self._predict_model_on_gpu(model=RandomForestClassifier, + X=X, output_class=output_class, threshold=threshold, algo=algo, - num_classes=num_classes, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format, predict_proba=True) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index feae6d5b87..6fa4090e6a 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -21,7 +21,6 @@ import ctypes import cudf -import math import numpy as np import warnings @@ -41,8 +40,7 @@ from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel as tl -from cuml.common import input_to_cuml_array, input_to_dev_array, \ - zeros, get_cudf_column_ptr +from cuml.common import input_to_cuml_array from cython.operator cimport dereference as deref from numba import cuda @@ -291,27 +289,11 @@ class RandomForestRegressor(BaseRandomForestModel): self.rf_forest) free( self.rf_forest64) - + """ def _obtain_treelite_handle(self): - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - if self.treelite_handle is None: - task_category = REGRESSION_MODEL - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - return self.treelite_handle + cdef RandomForestMetaData[float, float] rf_forest + self._get_treelite_handle(rf_forest) + """ def _get_protobuf_bytes(self): """ @@ -411,7 +393,7 @@ class RandomForestRegressor(BaseRandomForestModel): These labels should be contiguous integers from 0 to n_classes. """ cdef uintptr_t X_ptr, y_ptr - X_m, y_m, max_feature_val = self._fit_setup(X, y, convert_dtype) + X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) X_ptr = X_m.ptr y_ptr = y_m.ptr cdef cumlHandle* handle_ =\ From f229b79a4da7b146c4e7a021cf7635be6cff9d36 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Tue, 12 May 2020 20:25:46 -0500 Subject: [PATCH 03/32] update common file --- python/cuml/ensemble/randomforest_common.pyx | 31 ++++++++++++++++---- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 54f9f48414..7d35af96a3 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -153,13 +153,10 @@ class BaseRandomForestModel(Base): mod_ptr = cuml_model_ptr self.treelite_handle = ctypes.c_void_p(mod_ptr).value print(self.RF_type) - cdef cython.float a - cdef cython.type b if self.RF_type == CLASSIFICATION: - meta = create_meta(a, b) + meta = self.rf_forest else: - meta = create_meta(a, a) - # self.rf_forest + meta = self.rf_forest #cdef object (*meta_info)(float, int) #meta_info = create_meta @@ -186,6 +183,8 @@ class BaseRandomForestModel(Base): # Reset the old tree data for new fit call self._reset_forest_data() + #cdef uintptr_t X_ptr, y_ptr + X_m, self.n_rows, self.n_cols, self.dtype = \ input_to_cuml_array(X, check_dtype=[np.float32, np.float64], order='F') @@ -280,6 +279,28 @@ class BaseRandomForestModel(Base): setattr(self, key, value) return self + """ + def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type): + cdef ModelHandle cuml_model_ptr = NULL + cdef rf_class_float *rf_forest_class + cdef rf_reg_float *rf_forest_reg + if task_category == CLASSIFICATION: + rf_forest_class = \ + self.rf_forest + + else: + rf_forest_reg = \ + self.rf_forest + build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr, + rf_forest_reg, + self.n_cols, + task_category, + self.model_pbuf_bytes) + mod_ptr = cuml_model_ptr + treelite_handle = ctypes.c_void_p(mod_ptr).value + return treelite_handle + + """ def _get_protobuf_bytes_common(self, model): fit_mod_ptr = model._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr From a77a468b4c815ccd4140423730a47a300367e293 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 13 May 2020 07:28:56 -0500 Subject: [PATCH 04/32] ypdate predict func --- cpp/include/cuml/ensemble/randomforest.hpp | 16 +-- cpp/src/randomforest/randomforest.cu | 25 ++-- python/cuml/ensemble/randomforest_common.pyx | 108 +++++++++++++++++- python/cuml/ensemble/randomforest_shared.pxd | 7 ++ .../cuml/ensemble/randomforestclassifier.pyx | 78 ++----------- .../cuml/ensemble/randomforestregressor.pyx | 77 ++----------- 6 files changed, 154 insertions(+), 157 deletions(-) diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp index 8f8b04a5d2..f3ddcec3ad 100644 --- a/cpp/include/cuml/ensemble/randomforest.hpp +++ b/cpp/include/cuml/ensemble/randomforest.hpp @@ -144,14 +144,17 @@ void fit(const cumlHandle& user_handle, RandomForestClassifierD*& forest, int n_unique_labels, RF_params rf_params, int verbosity = CUML_LEVEL_INFO); +template void predict(const cumlHandle& user_handle, - const RandomForestClassifierF* forest, const float* input, - int n_rows, int n_cols, int* predictions, + const RandomForestMetaData* forest, const T* input, + int n_rows, int n_cols, L* predictions, int verbosity = CUML_LEVEL_INFO); +/** void predict(const cumlHandle& user_handle, const RandomForestClassifierD* forest, const double* input, int n_rows, int n_cols, int* predictions, int verbosity = CUML_LEVEL_INFO); +*/ void predictGetAll(const cumlHandle& user_handle, const RandomForestClassifierF* forest, const float* input, @@ -190,13 +193,10 @@ void fit(const cumlHandle& user_handle, RandomForestRegressorD*& forest, double* input, int n_rows, int n_cols, double* labels, RF_params rf_params, int verbosity = CUML_LEVEL_INFO); +template void predict(const cumlHandle& user_handle, - const RandomForestRegressorF* forest, const float* input, - int n_rows, int n_cols, float* predictions, - int verbosity = CUML_LEVEL_INFO); -void predict(const cumlHandle& user_handle, - const RandomForestRegressorD* forest, const double* input, - int n_rows, int n_cols, double* predictions, + const RandomForestMetaData* forest, const T* input, + int n_rows, int n_cols, T* predictions, int verbosity = CUML_LEVEL_INFO); RF_metrics score(const cumlHandle& user_handle, diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 4287e40a96..ee3ffea291 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -526,16 +526,18 @@ void fit(const cumlHandle& user_handle, RandomForestClassifierD*& forest, * @param[in] verbosity: verbosity level for logging messages during execution * @{ */ +template void predict(const cumlHandle& user_handle, - const RandomForestClassifierF* forest, const float* input, - int n_rows, int n_cols, int* predictions, int verbosity) { + const RandomForestMetaData* forest, const T* input, + int n_rows, int n_cols, L* predictions, int verbosity) { ASSERT(forest->trees, "Cannot predict! No trees in the forest."); - std::shared_ptr> rf_classifier = - std::make_shared>(forest->rf_params); + std::shared_ptr> rf_classifier = + std::make_shared>(forest->rf_params); rf_classifier->predict(user_handle, input, n_rows, n_cols, predictions, forest, verbosity); } +/** void predict(const cumlHandle& user_handle, const RandomForestClassifierD* forest, const double* input, int n_rows, int n_cols, int* predictions, int verbosity) { @@ -545,6 +547,7 @@ void predict(const cumlHandle& user_handle, rf_classifier->predict(user_handle, input, n_rows, n_cols, predictions, forest, verbosity); } +*/ /** @} */ /** @@ -689,16 +692,18 @@ void fit(const cumlHandle& user_handle, RandomForestRegressorD*& forest, * @param[in] verbosity: verbosity level for logging messages during execution * @{ */ +template void predict(const cumlHandle& user_handle, - const RandomForestRegressorF* forest, const float* input, - int n_rows, int n_cols, float* predictions, int verbosity) { + const RandomForestMetaData* forest, const T* input, + int n_rows, int n_cols, T* predictions, int verbosity) { ASSERT(forest->trees, "Cannot predict! No trees in the forest."); - std::shared_ptr> rf_regressor = - std::make_shared>(forest->rf_params); + std::shared_ptr> rf_regressor = + std::make_shared>(forest->rf_params); rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, forest, verbosity); } +/** void predict(const cumlHandle& user_handle, const RandomForestRegressorD* forest, const double* input, int n_rows, int n_cols, double* predictions, int verbosity) { @@ -708,6 +713,9 @@ void predict(const cumlHandle& user_handle, rf_regressor->predict(user_handle, input, n_rows, n_cols, predictions, forest, verbosity); } + +*/ + /** @} */ /** @@ -742,6 +750,7 @@ RF_metrics score(const cumlHandle& user_handle, user_handle, ref_labels, n_rows, predictions, verbosity); return regression_score; } + /** @} */ // Functions' specializations diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 7d35af96a3..a8ecdef6fb 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -24,6 +24,9 @@ from cuml import ForestInference from cuml.fil.fil import TreeliteModel as tl from cuml.common.handle import Handle from cuml.common.base import Base +from cuml.common.array import CumlArray + +from cython.operator cimport dereference as deref from cuml.ensemble.randomforest_shared cimport * from cuml.common import input_to_cuml_array, rmm_cupy_ary @@ -152,11 +155,8 @@ class BaseRandomForestModel(Base): mod_ptr = cuml_model_ptr self.treelite_handle = ctypes.c_void_p(mod_ptr).value - print(self.RF_type) - if self.RF_type == CLASSIFICATION: - meta = self.rf_forest - else: - meta = self.rf_forest + meta = self.rf_forest + #cdef object (*meta_info)(float, int) #meta_info = create_meta @@ -220,6 +220,56 @@ class BaseRandomForestModel(Base): return X_m, y_m, max_feature_val + def _get_protobuf_bytes(self): + """ + Returns the self.model_pbuf_bytes. + Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite + models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model + to a treelite protobuf model which is stored in a temporary file. + The protobuf model information is read from the temporary file and + the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite + protobuf model bytes are stored in `self.model_pbuf_bytes`. If either + of information is already present in the model then the respective + step is skipped. + """ + if self.model_pbuf_bytes: + return self.model_pbuf_bytes + elif self.treelite_handle: + fit_mod_ptr = self.treelite_handle + else: + fit_mod_ptr = self._obtain_treelite_handle() + cdef uintptr_t model_ptr = fit_mod_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self.model_pbuf_bytes + + def _concatenate_treelite_handle(self, treelite_handle): + cdef ModelHandle concat_model_handle = NULL + cdef vector[ModelHandle] *model_handles \ + = new vector[ModelHandle]() + cdef uintptr_t mod_ptr + for i in treelite_handle: + mod_ptr = i + model_handles.push_back(( + mod_ptr)) + + concat_model_handle = concatenate_trees(deref(model_handles)) + + cdef uintptr_t concat_model_ptr = concat_model_handle + self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( concat_model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self + def _predict_model_on_gpu(self, model, X, algo, convert_dtype, fil_sparse_format, threshold=0.5, output_class=False, predict_proba=False): @@ -257,6 +307,54 @@ class BaseRandomForestModel(Base): tl.free_treelite_model(self.treelite_handle) return preds + def _predict_model_on_cpu(self, X, convert_dtype): + out_type = self._get_output_type(X) + cdef uintptr_t X_ptr + X_m, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='C', + convert_to_dtype=(self.dtype if convert_dtype + else None), + check_cols=self.n_cols) + X_ptr = X_m.ptr + + preds = CumlArray.zeros(n_rows, dtype=dtype) + cdef uintptr_t preds_ptr = preds.ptr + + cdef cumlHandle* handle_ =\ + self.handle.getHandle() + + cdef RandomForestMetaData[float, int] *rf_forest = \ + self.rf_forest + + cdef RandomForestMetaData[double, int] *rf_forest64 = \ + self.rf_forest64 + if self.dtype == np.float32: + predict(handle_[0], + rf_forest, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) + + elif self.dtype == np.float64: + predict(handle_[0], + rf_forest64, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) + else: + raise TypeError("supports only float32 and float64 input," + " but input of type '%s' passed." + % (str(self.dtype))) + + self.handle.sync() + # synchronous w/o a stream + del(X_m) + return preds.to_output(out_type) + def _get_params(self, model, deep): params = dict() for key in model.variables: diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index e7074e91e9..d9270a3ead 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -157,3 +157,10 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: cdef ModelHandle concatenate_trees( vector[ModelHandle] &treelite_handles) except + + cdef void predict[T, L](cumlHandle& handle, + RandomForestMetaData[T, L] *, + T*, + int, + int, + L*, + int) except + diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 4f5c9f0e56..5a98251932 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -33,8 +33,6 @@ from libcpp.vector cimport vector from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free -from cython.operator cimport dereference as deref - from cuml import ForestInference from cuml.common.array import CumlArray from cuml.common.base import Base @@ -77,21 +75,13 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": RF_params, int) except + - cdef void predict(cumlHandle& handle, - RandomForestMetaData[float, int] *, - float*, - int, - int, - int*, - bool) except + - - cdef void predict(cumlHandle& handle, - RandomForestMetaData[double, int]*, - double*, - int, - int, - int*, - bool) except + + cdef void predict[T, L](cumlHandle& handle, + RandomForestMetaData[T, L] *, + T*, + int, + int, + L*, + int) except + cdef void predictGetAll(cumlHandle& handle, RandomForestMetaData[float, int] *, @@ -308,37 +298,6 @@ class RandomForestClassifier(BaseRandomForestModel): self.rf_forest) free( self.rf_forest64) - self.treelite_handle = None - self.model_pbuf_bytes = bytearray() - - def _get_protobuf_bytes(self): - """ - Returns the self.model_pbuf_bytes. - Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite - models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. - """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes - elif self.treelite_handle: - fit_mod_ptr = self.treelite_handle - else: - fit_mod_ptr = self._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes def convert_to_treelite_model(self): """ @@ -423,26 +382,6 @@ class RandomForestClassifier(BaseRandomForestModel): return ctypes.c_void_p(mod_handle).value - def _concatenate_treelite_handle(self, treelite_handle): - cdef ModelHandle concat_model_handle = NULL - cdef vector[ModelHandle] *model_handles \ - = new vector[ModelHandle]() - cdef uintptr_t mod_ptr - for i in treelite_handle: - mod_ptr = i - model_handles.push_back(( - mod_ptr)) - - concat_model_handle = concatenate_trees(deref(model_handles)) - cdef uintptr_t concat_model_ptr = concat_model_handle - self.treelite_handle = concat_model_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self - def fit(self, X, y, convert_dtype=False): """ Perform Random Forest Classification on the input data @@ -532,7 +471,7 @@ class RandomForestClassifier(BaseRandomForestModel): del X_m del y_m return self - + """ def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr @@ -580,6 +519,7 @@ class RandomForestClassifier(BaseRandomForestModel): # synchronous w/o a stream del(X_m) return preds.to_output(out_type) + """ def predict(self, X, predict_model="GPU", output_class=True, threshold=0.5, diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 46d43c20a4..b356e8d901 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -68,22 +68,14 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": double*, RF_params, int) except + - - cdef void predict(cumlHandle& handle, - RandomForestMetaData[float, float] *, - float*, - int, - int, - float*, - int) except + - - cdef void predict(cumlHandle& handle, - RandomForestMetaData[double, double]*, - double*, - int, - int, - double*, - int) except + + + cdef void predict[T, T](cumlHandle& handle, + RandomForestMetaData[T, T] *, + T*, + int, + int, + T*, + int) except + cdef RF_metrics score(cumlHandle& handle, RandomForestMetaData[float, float]*, @@ -292,34 +284,6 @@ class RandomForestRegressor(BaseRandomForestModel): self.treelite_handle = None self.model_pbuf_bytes = bytearray() - def _get_protobuf_bytes(self): - """ - Returns the self.model_pbuf_bytes. - Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite - models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. - """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes - elif self.treelite_handle: - fit_mod_ptr = self.treelite_handle - else: - fit_mod_ptr = self._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes def convert_to_treelite_model(self): """ @@ -352,27 +316,6 @@ class RandomForestRegressor(BaseRandomForestModel): return ctypes.c_void_p(mod_handle).value - def _concatenate_treelite_handle(self, treelite_handle): - cdef ModelHandle concat_model_handle = NULL - cdef vector[ModelHandle] *model_handles \ - = new vector[ModelHandle]() - cdef uintptr_t mod_ptr - for i in treelite_handle: - mod_ptr = i - model_handles.push_back(( - mod_ptr)) - - concat_model_handle = concatenate_trees(deref(model_handles)) - - cdef uintptr_t concat_model_ptr = concat_model_handle - self.treelite_handle = concat_model_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self - def fit(self, X, y, convert_dtype=False): """ Perform Random Forest Regression on the input data @@ -461,7 +404,7 @@ class RandomForestRegressor(BaseRandomForestModel): check_cols=self.n_cols) X_ptr = X_m.ptr - preds = CumlArray.zeros(n_rows, dtype=dtype) + preds = CumlArray.zeros(n_rows, dtype=np.int32) cdef uintptr_t preds_ptr = preds.ptr cdef cumlHandle* handle_ =\ @@ -490,7 +433,7 @@ class RandomForestRegressor(BaseRandomForestModel): preds_ptr, self.verbosity) else: - raise TypeError("supports only float32 and float64 input," + raise TypeError("supports only np.float32 and np.float64 input," " but input of type '%s' passed." % (str(self.dtype))) From f4ddf130636b6a7300ea51c46c030da164ce3af0 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Thu, 14 May 2020 08:08:07 -0500 Subject: [PATCH 05/32] created cdef class to assign rfmetadata --- cpp/include/cuml/ensemble/randomforest.hpp | 3 + cpp/src/randomforest/randomforest.cu | 30 ++++ python/cuml/ensemble/randomforest_common.pyx | 170 +++++++----------- python/cuml/ensemble/randomforest_shared.pxd | 26 +-- .../cuml/ensemble/randomforestregressor.pyx | 38 ++-- 5 files changed, 135 insertions(+), 132 deletions(-) diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp index f3ddcec3ad..e2cb4e1138 100644 --- a/cpp/include/cuml/ensemble/randomforest.hpp +++ b/cpp/include/cuml/ensemble/randomforest.hpp @@ -131,6 +131,9 @@ ModelHandle concatenate_trees(std::vector treelite_handles); void compare_concat_forest_to_subforests( ModelHandle concat_tree_handle, std::vector treelite_handles); + +template +RandomForestMetaData* create_meta(T a, L b); // ----------------------------- Classification ----------------------------------- // typedef RandomForestMetaData RandomForestClassifierF; diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index ee3ffea291..99beb93d10 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -220,6 +220,11 @@ void print(const RF_params rf_params) { DecisionTree::print(rf_params.tree_params); } +template +RandomForestMetaData* create_meta(T a, L b) { + RandomForestMetaData* rf = new RandomForestMetaData(); + return rf; +} /** * @brief Set the trees pointer of RandomForestMetaData to nullptr. * @param[in, out] forest: CPU pointer to RandomForestMetaData. @@ -754,6 +759,31 @@ RF_metrics score(const cumlHandle& user_handle, /** @} */ // Functions' specializations + +template RandomForestMetaData* create_meta(float a, float b); +template RandomForestMetaData* create_meta(double a, double b); +template RandomForestMetaData* create_meta(float a , int b); +template RandomForestMetaData* create_meta(double a, int b); + + +template void predict( + const cumlHandle& user_handle, + const RandomForestMetaData* forest, const double* input, + int n_rows, int n_cols, int* predictions, int verbosity); +template void predict( + const cumlHandle& user_handle, + const RandomForestMetaData* forest, const float* input, + int n_rows, int n_cols, int* predictions, int verbosity); + +template void predict( + const cumlHandle& user_handle, + const RandomForestMetaData* forest, const float* input, + int n_rows, int n_cols, float* predictions, int verbosity); +template void predict( + const cumlHandle& user_handle, + const RandomForestMetaData* forest, const double* input, + int n_rows, int n_cols, double* predictions, int verbosity); + template void print_rf_summary( const RandomForestClassifierF* forest); template void print_rf_summary( diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index a8ecdef6fb..ee8dcb78e3 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -33,10 +33,56 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary cimport cython -#RandomForestMetaData[X_dtype, y_dtype] *meta -#cdef creat_meta(X_dtype a, y_dtype b): -# meta = new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]() - #return meta + +cdef class BaseRandomForestModel_impl(): + cpdef creat_meta(self, X_dtype a, y_dtype b): + meta = \ + new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]() + cdef RandomForestMetaData[float, int] *rf_class + cdef RandomForestMetaData[float, float] *rf_reg + if cython.typeof(b) == cython.int: + rf_class = self.rf_forest + meta = rf_class + else: + rf_reg = self.rf_forest + meta = rf_reg + + def _obtain_treelite_handle(self): + if self.treelite_handle: + return self.treelite_handle + cdef ModelHandle cuml_model_ptr = NULL + cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes + cdef vector[unsigned char] model_pbuf_vec + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + cdef cython.float a = 10.0 + cdef cython.int b = 5 + base_rf = BaseRandomForestModel_impl() + base_rf.creat_meta(a, b) + if self.RF_type == CLASSIFICATION: + build_treelite_forest( + & cuml_model_ptr, + # self.rf_forest, + meta, + self.n_cols, + self.num_classes, + model_pbuf_vec) + else: + build_treelite_forest( + & cuml_model_ptr, + # self.rf_forest, + meta, + self.n_cols, + self.num_classes, + model_pbuf_vec) + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + + return self.treelite_handle class BaseRandomForestModel(Base): @@ -48,6 +94,9 @@ class BaseRandomForestModel(Base): 'verbose', 'rows_sample', 'max_leaves', 'quantile_per_tree'] + def __init__(self): + self._impl = BaseRandomForestModel_impl() + def _create_model(self, model, seed, split_criterion, n_streams, n_estimators=100, max_depth=16, handle=None, max_features='auto', @@ -146,36 +195,8 @@ class BaseRandomForestModel(Base): " please read the documentation") def _obtain_treelite_handle(self): - cdef ModelHandle cuml_model_ptr = NULL - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - meta = self.rf_forest - - #cdef object (*meta_info)(float, int) - #meta_info = create_meta - - #cdef RandomForestMetaData[cython.typeof(a), cython.typeof(b)] *meta_info = get_meta_data[cython.typeof(a), cython.typeof(b)]( self.rf_forest) - # self.forest #get_meta_data[float, int]( self.rf_forest) - #cdef RandomForestMetaData[float, int] *rf_forest - #cdef fused_rf_meta *forest = \ - # self.rf_forest - if self.treelite_handle is None: - build_treelite_forest( - & cuml_model_ptr, - meta, - self.n_cols, - self.num_classes, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - return self.treelite_handle + return self._impl._obtain_treelite_handle def _dataset_setup(self, X, y, convert_dtype): self._set_output_type(X) @@ -249,6 +270,19 @@ class BaseRandomForestModel(Base): self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) return self.model_pbuf_bytes + def _tl_model_handles(self, model_bytes): + cdef ModelHandle cuml_model_ptr = NULL + meta = self.rf_forest + task_category = CLASSIFICATION_MODEL + build_treelite_forest(& cuml_model_ptr, + meta, + self.n_cols, + task_category, + model_bytes) + mod_handle = cuml_model_ptr + + return ctypes.c_void_p(mod_handle).value + def _concatenate_treelite_handle(self, treelite_handle): cdef ModelHandle concat_model_handle = NULL cdef vector[ModelHandle] *model_handles \ @@ -307,54 +341,6 @@ class BaseRandomForestModel(Base): tl.free_treelite_model(self.treelite_handle) return preds - def _predict_model_on_cpu(self, X, convert_dtype): - out_type = self._get_output_type(X) - cdef uintptr_t X_ptr - X_m, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='C', - convert_to_dtype=(self.dtype if convert_dtype - else None), - check_cols=self.n_cols) - X_ptr = X_m.ptr - - preds = CumlArray.zeros(n_rows, dtype=dtype) - cdef uintptr_t preds_ptr = preds.ptr - - cdef cumlHandle* handle_ =\ - self.handle.getHandle() - - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - - cdef RandomForestMetaData[double, int] *rf_forest64 = \ - self.rf_forest64 - if self.dtype == np.float32: - predict(handle_[0], - rf_forest, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) - - elif self.dtype == np.float64: - predict(handle_[0], - rf_forest64, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) - else: - raise TypeError("supports only float32 and float64 input," - " but input of type '%s' passed." - % (str(self.dtype))) - - self.handle.sync() - # synchronous w/o a stream - del(X_m) - return preds.to_output(out_type) - def _get_params(self, model, deep): params = dict() for key in model.variables: @@ -377,28 +363,6 @@ class BaseRandomForestModel(Base): setattr(self, key, value) return self - """ - def _obtain_treelite_handle_common(self, task_category, rf_meta_type rf_type): - cdef ModelHandle cuml_model_ptr = NULL - cdef rf_class_float *rf_forest_class - cdef rf_reg_float *rf_forest_reg - if task_category == CLASSIFICATION: - rf_forest_class = \ - self.rf_forest - - else: - rf_forest_reg = \ - self.rf_forest - build_treelite_forest[self.dtype, self.y_type](& cuml_model_ptr, - rf_forest_reg, - self.n_cols, - task_category, - self.model_pbuf_bytes) - mod_ptr = cuml_model_ptr - treelite_handle = ctypes.c_void_p(mod_ptr).value - return treelite_handle - - """ def _get_protobuf_bytes_common(self, model): fit_mod_ptr = model._obtain_treelite_handle() cdef uintptr_t model_ptr = fit_mod_ptr diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index d9270a3ead..dc7b197bd2 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -114,12 +114,25 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: ctypedef fused fused_rf_meta: RandomForestMetaData[float, float] - RandomForestMetaData[double, double] + #RandomForestMetaData[double, double] RandomForestMetaData[float, int] - RandomForestMetaData[double, int] + #RandomForestMetaData[double, int] - cdef fused_rf_meta *meta + ctypedef fused X_dtype: + cython.float + cython.double + ctypedef fused y_dtype: + cython.int + cython.float + cython.double + + cpdef fused_rf_meta *meta + + cdef RandomForestMetaData[T, L]* create_meta[T, L](T a, L b) + #cdef fused_rf_meta *meta + #fused_rf_meta = cython.fused(RandomForestRegressorF, RandomForestRegressorD, + # RandomForestClassifierF, RandomForestClassifierD) # # Treelite handling # @@ -157,10 +170,3 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: cdef ModelHandle concatenate_trees( vector[ModelHandle] &treelite_handles) except + - cdef void predict[T, L](cumlHandle& handle, - RandomForestMetaData[T, L] *, - T*, - int, - int, - L*, - int) except + diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index b356e8d901..a811b32518 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -69,13 +69,13 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": RF_params, int) except + - cdef void predict[T, T](cumlHandle& handle, - RandomForestMetaData[T, T] *, - T*, - int, - int, - T*, - int) except + + cdef void predict[T](cumlHandle& handle, + RandomForestMetaData[T, T] *, + T*, + int, + int, + T*, + int) except + cdef RF_metrics score(cumlHandle& handle, RandomForestMetaData[float, float]*, @@ -417,21 +417,21 @@ class RandomForestRegressor(BaseRandomForestModel): self.rf_forest64 if self.dtype == np.float32: predict(handle_[0], - rf_forest, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) + rf_forest, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) elif self.dtype == np.float64: predict(handle_[0], - rf_forest64, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) + rf_forest64, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) else: raise TypeError("supports only np.float32 and np.float64 input," " but input of type '%s' passed." From 6e64046fdf27ad76c34a1ec3bccfd6301411dfbd Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Thu, 21 May 2020 12:14:42 -0500 Subject: [PATCH 06/32] updated rf cython code --- python/cuml/ensemble/base.pyx | 243 ++++++++++++++++++ python/cuml/ensemble/randomforest_common.pyx | 237 +++++++++-------- python/cuml/ensemble/randomforest_shared.pxd | 22 -- .../cuml/ensemble/randomforestclassifier.pyx | 178 ++++++++++--- .../cuml/ensemble/randomforestregressor.pyx | 189 +++++++++++--- 5 files changed, 665 insertions(+), 204 deletions(-) create mode 100644 python/cuml/ensemble/base.pyx diff --git a/python/cuml/ensemble/base.pyx b/python/cuml/ensemble/base.pyx new file mode 100644 index 0000000000..1549dfd3cc --- /dev/null +++ b/python/cuml/ensemble/base.pyx @@ -0,0 +1,243 @@ +import ctypes +import cupy as cp +import math +import warnings + +import numpy as np +from cuml import ForestInference +from cuml.fil.fil import TreeliteModel as tl +from cuml.common.handle import Handle +from cuml.common.base import Base +from cuml.common.array import CumlArray + +from cython.operator cimport dereference as deref + +from cuml.ensemble.randomforest_shared cimport * +from cuml.common import input_to_cuml_array, rmm_cupy_ary + + + +class BaseRandomForestModel(Base): + variables = ['n_estimators', 'max_depth', 'handle', + 'max_features', 'n_bins', + 'split_algo', 'split_criterion', 'min_rows_per_node', + 'min_impurity_decrease', + 'bootstrap', 'bootstrap_features', + 'verbose', 'rows_sample', + 'max_leaves', 'quantile_per_tree'] + + def _create_model(self, model, seed, split_criterion, + n_streams, n_estimators=100, + max_depth=16, handle=None, max_features='auto', + n_bins=8, split_algo=1, bootstrap=True, + bootstrap_features=False, + verbose=False, min_rows_per_node=2, + rows_sample=1.0, max_leaves=-1, + accuracy_metric=None, dtype=None, + output_type=None, min_samples_leaf=None, + min_weight_fraction_leaf=None, n_jobs=None, + max_leaf_nodes=None, min_impurity_decrease=0.0, + min_impurity_split=None, oob_score=None, + random_state=None, warm_start=None, class_weight=None, + quantile_per_tree=False, criterion=None): + + if accuracy_metric: + model.variables.append('accuracy_metric') + sklearn_params = {"criterion": criterion, + "min_samples_leaf": min_samples_leaf, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + "max_leaf_nodes": max_leaf_nodes, + "min_impurity_split": min_impurity_split, + "oob_score": oob_score, "n_jobs": n_jobs, + "random_state": random_state, + "warm_start": warm_start, + "class_weight": class_weight} + + for key, vals in sklearn_params.items(): + if vals is not None: + raise TypeError(" The Scikit-learn variable ", key, + " is not supported in cuML," + " please read the cuML documentation for" + " more information") + + if handle is None: + handle = Handle(n_streams) + + super(model, self).__init__(handle=handle, + verbose=verbose, + output_type=output_type) + if max_depth < 0: + raise ValueError("Must specify max_depth >0 ") + + self.split_algo = split_algo + criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, + '3': MAE, '4': CRITERION_END} + if str(split_criterion) not in criterion_dict.keys(): + warnings.warn("The split criterion chosen was not present" + " in the list of options accepted by the model" + " and so the CRITERION_END option has been chosen.") + self.split_criterion = CRITERION_END + else: + self.split_criterion = criterion_dict[str(split_criterion)] + + self.min_rows_per_node = min_rows_per_node + self.min_impurity_decrease = min_impurity_decrease + self.bootstrap_features = bootstrap_features + self.rows_sample = rows_sample + self.max_leaves = max_leaves + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.bootstrap = bootstrap + self.verbose = verbose + self.n_bins = n_bins + self.n_cols = None + self.dtype = dtype + self.accuracy_metric = accuracy_metric + self.quantile_per_tree = quantile_per_tree + self.n_streams = handle.getNumInternalStreams() + self.seed = seed + self.model_pbuf_bytes = bytearray() + self.treelite_handle = None + + def _dataset_setup(self, X, y, convert_dtype): + self._set_output_type(X) + + # Reset the old tree data for new fit call + self._reset_forest_data() + + #cdef uintptr_t X_ptr, y_ptr + + X_m, self.n_rows, self.n_cols, self.dtype = \ + input_to_cuml_array(X, check_dtype=[np.float32, np.float64], + order='F') + if self.RF_type == CLASSIFICATION: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, check_dtype=np.int32, + convert_to_dtype=(np.int32 if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + if y_dtype != np.int32: + raise TypeError("The labels `y` need to be of dtype `np.int32`") + unique_labels = rmm_cupy_ary(cp.unique, y_m) + self.num_classes = len(unique_labels) + for i in range(self.num_classes): + if i not in unique_labels: + raise ValueError("The labels need " + "to be consecutive values from " + "0 to the number of unique label values") + else: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, + convert_to_dtype=(self.dtype if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + + if self.dtype == np.float64: + warnings.warn("To use GPU-based prediction, first train using \ + float 32 data to fit the estimator.") + + max_feature_val = self._get_max_feat_val() + if type(self.min_rows_per_node) == float: + self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows) + + return X_m, y_m, max_feature_val + + def _get_params(self, model, deep): + params = dict() + for key in model.variables: + if key in ['handle']: + continue + var_value = getattr(self, key, None) + params[key] = var_value + return params + + def _set_params(self, model, **params): + self.handle.__setstate__(self.n_streams) + self.model_pbuf_bytes = [] + + if not params: + return self + for key, value in params.items(): + if key not in model.variables: + raise ValueError('Invalid parameter for estimator') + else: + setattr(self, key, value) + return self + + +def _check_fil_parameter_validity(depth, algo, fil_sparse_format): + storage_format = _check_fil_sparse_format_value(fil_sparse_format) + if (depth > 16 and (storage_format == 'dense' or + algo == 'tree_reorg' or + algo == 'batch_tree_reorg')): + raise ValueError("While creating a forest with max_depth greater " + "than 16, `fil_sparse_format` should be True. " + "If `fil_sparse_format=False` then the memory" + "consumed while creating the FIL forest is very " + "large and the process will be aborted. In " + "addition, `algo` must be either set to `naive' " + "or `auto` to set 'fil_sparse_format=True`.") + print(" storage_format : ", storage_format) + return storage_format + + +def _check_fil_sparse_format_value(fil_sparse_format): + accepted_vals = [True, False, 'auto'] + if fil_sparse_format == 'auto': + storage_format = fil_sparse_format + elif not fil_sparse_format: + storage_format = 'dense' + elif fil_sparse_format not in accepted_vals: + raise ValueError("The value entered for spares_forest is not " + "supported. Please refer to the documentation " + "to see the accepted values.") + else: + storage_format = 'sparse' + print(" storage_format : ", storage_format) + return storage_format + + +def _obtain_treelite_model(treelite_handle): + """ + Creates a Treelite model using the treelite handle + obtained from the cuML Random Forest model. + + Returns + ---------- + tl_to_fil_model : Treelite version of this model + """ + treelite_model = \ + tl.from_treelite_model_handle(treelite_handle) + return treelite_model + + +def _obtain_fil_model(treelite_handle, depth, + output_class=True, + threshold=0.5, algo='auto', + fil_sparse_format='auto'): + """ + Creates a Forest Inference (FIL) model using the treelite + handle obtained from the cuML Random Forest model. + + Returns + ---------- + fil_model : + A Forest Inference model which can be used to perform + inferencing on the random forest model. + """ + print(" treelite handle in obt fil : ", treelite_handle) + storage_format = \ + _check_fil_parameter_validity(depth=depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_format) + + return tl_to_fil_model diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index ee8dcb78e3..e9225847ce 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -33,58 +33,7 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary cimport cython - -cdef class BaseRandomForestModel_impl(): - cpdef creat_meta(self, X_dtype a, y_dtype b): - meta = \ - new RandomForestMetaData[cython.typeof(a), cython.typeof(b)]() - cdef RandomForestMetaData[float, int] *rf_class - cdef RandomForestMetaData[float, float] *rf_reg - if cython.typeof(b) == cython.int: - rf_class = self.rf_forest - meta = rf_class - else: - rf_reg = self.rf_forest - meta = rf_reg - - def _obtain_treelite_handle(self): - if self.treelite_handle: - return self.treelite_handle - cdef ModelHandle cuml_model_ptr = NULL - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - cdef cython.float a = 10.0 - cdef cython.int b = 5 - base_rf = BaseRandomForestModel_impl() - base_rf.creat_meta(a, b) - if self.RF_type == CLASSIFICATION: - build_treelite_forest( - & cuml_model_ptr, - # self.rf_forest, - meta, - self.n_cols, - self.num_classes, - model_pbuf_vec) - else: - build_treelite_forest( - & cuml_model_ptr, - # self.rf_forest, - meta, - self.n_cols, - self.num_classes, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - - return self.treelite_handle - - +# create a cdef class and cdef func which will call the C++ cdef func and then return the required handle and stuff class BaseRandomForestModel(Base): variables = ['n_estimators', 'max_depth', 'handle', 'max_features', 'n_bins', @@ -94,9 +43,6 @@ class BaseRandomForestModel(Base): 'verbose', 'rows_sample', 'max_leaves', 'quantile_per_tree'] - def __init__(self): - self._impl = BaseRandomForestModel_impl() - def _create_model(self, model, seed, split_criterion, n_streams, n_estimators=100, max_depth=16, handle=None, max_features='auto', @@ -170,12 +116,7 @@ class BaseRandomForestModel(Base): self.seed = seed self.model_pbuf_bytes = bytearray() self.treelite_handle = None - # if self.model_type == curfr: - # print have a check for the random forest meta data in init - """ - def _check_rf_meta_data_format(self, task_category): - if task_category == CLASSIFICATION - """ + def _get_max_feat_val(self): if type(self.max_features) == int: return self.max_features/self.n_cols @@ -194,9 +135,67 @@ class BaseRandomForestModel(Base): raise ValueError("Wrong value passed in for max_features" " please read the documentation") + def _get_protobuf_bytes(self): + """ + Returns the self.model_pbuf_bytes. + Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite + models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model + to a treelite protobuf model which is stored in a temporary file. + The protobuf model information is read from the temporary file and + the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite + protobuf model bytes are stored in `self.model_pbuf_bytes`. If either + of information is already present in the model then the respective + step is skipped. + """ + if self.model_pbuf_bytes: + return self.model_pbuf_bytes + elif self.treelite_handle: + fit_mod_ptr = self.treelite_handle + else: + fit_mod_ptr = self._obtain_treelite_handle() + cdef uintptr_t model_ptr = fit_mod_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self.model_pbuf_bytes + def _obtain_treelite_handle(self): - - return self._impl._obtain_treelite_handle + if self.treelite_handle: + print(" treelite handle in obt : ", self.treelite_handle) + return self.treelite_handle # Use cached version + cdef ModelHandle cuml_model_ptr = NULL + cdef unsigned char[::1] model_pbuf_mv + cdef vector[unsigned char] model_pbuf_vec + if self.model_pbuf_bytes: + model_pbuf_mv = self.model_pbuf_bytes + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + else: + model_pbuf_vec = bytearray() + if self.RF_type == CLASSIFICATION: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + self.num_classes, + model_pbuf_vec) + else: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + REGRESSION_MODEL, + model_pbuf_vec) + + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + return self.treelite_handle def _dataset_setup(self, X, y, convert_dtype): self._set_output_type(X) @@ -204,8 +203,6 @@ class BaseRandomForestModel(Base): # Reset the old tree data for new fit call self._reset_forest_data() - #cdef uintptr_t X_ptr, y_ptr - X_m, self.n_rows, self.n_cols, self.dtype = \ input_to_cuml_array(X, check_dtype=[np.float32, np.float64], order='F') @@ -241,48 +238,70 @@ class BaseRandomForestModel(Base): return X_m, y_m, max_feature_val - def _get_protobuf_bytes(self): - """ - Returns the self.model_pbuf_bytes. - Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite - models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. - """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes - elif self.treelite_handle: - fit_mod_ptr = self.treelite_handle - else: - fit_mod_ptr = self._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes + """ + def _predict_model_on_gpu(self, X, + algo, + convert_dtype, + fil_sparse_format, + output_class=False, + threshold=0.5, + predict_proba=False): + out_type = self._get_output_type(X) + cdef ModelHandle cuml_model_ptr = NULL + _, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='F', + check_cols=self.n_cols) + if dtype == np.float64 and not convert_dtype: + raise TypeError("GPU based predict only accepts np.float32 data. \ + Please set convert_dtype=True to convert the test \ + data to the same dtype as the data used to train, \ + ie. np.float32. If you would like to use test \ + data of dtype=np.float64 please set \ + predict_model='CPU' to use the CPU implementation \ + of predict.") + + self._obtain_treelite_handle() + storage_type = \ + _check_fil_parameter_validity(depth=self.max_depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(self.treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_type) + + preds = tl_to_fil_model.predict(X, output_type=out_type, + predict_proba=predict_proba) + tl.free_treelite_model(self.treelite_handle) + return preds + """ + def _tl_model_handles(self, model_bytes): cdef ModelHandle cuml_model_ptr = NULL - meta = self.rf_forest - task_category = CLASSIFICATION_MODEL - build_treelite_forest(& cuml_model_ptr, - meta, - self.n_cols, - task_category, - model_bytes) + if self.RF_type == CLASSIFICATION: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + self.num_classes, + model_bytes) + else: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + 1, + model_bytes) mod_handle = cuml_model_ptr return ctypes.c_void_p(mod_handle).value + def _concatenate_treelite_handle(self, treelite_handle): cdef ModelHandle concat_model_handle = NULL cdef vector[ModelHandle] *model_handles \ @@ -292,11 +311,11 @@ class BaseRandomForestModel(Base): mod_ptr = i model_handles.push_back(( mod_ptr)) - + print(" run the concat c++ func") concat_model_handle = concatenate_trees(deref(model_handles)) - cdef uintptr_t concat_model_ptr = concat_model_handle self.treelite_handle = concat_model_ptr + print(" treelite handle in concat : ", self.treelite_handle) cdef vector[unsigned char] pbuf_mod_info = \ save_model( concat_model_ptr) cdef unsigned char[::1] pbuf_mod_view = \ @@ -304,7 +323,8 @@ class BaseRandomForestModel(Base): self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) return self - def _predict_model_on_gpu(self, model, X, algo, convert_dtype, + + def _predict_model_on_gpu(self, X, algo, convert_dtype, fil_sparse_format, threshold=0.5, output_class=False, predict_proba=False): out_type = self._get_output_type(X) @@ -339,8 +359,9 @@ class BaseRandomForestModel(Base): preds = tl_to_fil_model.predict(X, output_type=out_type, predict_proba=predict_proba) tl.free_treelite_model(self.treelite_handle) + self.treelite_handle = None return preds - + def _get_params(self, model, deep): params = dict() for key in model.variables: @@ -363,12 +384,6 @@ class BaseRandomForestModel(Base): setattr(self, key, value) return self - def _get_protobuf_bytes_common(self, model): - fit_mod_ptr = model._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - model_protobuf_bytes = save_model( model_ptr) - return model_protobuf_bytes - def _check_fil_parameter_validity(depth, algo, fil_sparse_format): storage_format = _check_fil_sparse_format_value(fil_sparse_format) @@ -397,7 +412,6 @@ def _check_fil_sparse_format_value(fil_sparse_format): "to see the accepted values.") else: storage_format = 'sparse' - return storage_format @@ -429,7 +443,6 @@ def _obtain_fil_model(treelite_handle, depth, A Forest Inference model which can be used to perform inferencing on the random forest model. """ - storage_format = \ _check_fil_parameter_validity(depth=depth, fil_sparse_format=fil_sparse_format, diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index dc7b197bd2..05abfa7135 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -111,28 +111,6 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: ctypedef TreeMetaDataNode[T, L]* trees RF_params rf_params - - ctypedef fused fused_rf_meta: - RandomForestMetaData[float, float] - #RandomForestMetaData[double, double] - RandomForestMetaData[float, int] - #RandomForestMetaData[double, int] - - ctypedef fused X_dtype: - cython.float - cython.double - - ctypedef fused y_dtype: - cython.int - cython.float - cython.double - - cpdef fused_rf_meta *meta - - cdef RandomForestMetaData[T, L]* create_meta[T, L](T a, L b) - #cdef fused_rf_meta *meta - #fused_rf_meta = cython.fused(RandomForestRegressorF, RandomForestRegressorD, - # RandomForestClassifierF, RandomForestClassifierD) # # Treelite handling # diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 5a98251932..e6de676912 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -33,9 +33,10 @@ from libcpp.vector cimport vector from libc.stdint cimport uintptr_t from libc.stdlib cimport calloc, malloc, free +from cython.operator cimport dereference as deref + from cuml import ForestInference from cuml.common.array import CumlArray -from cuml.common.base import Base from cuml.common.handle import Handle from cuml.ensemble.randomforest_common import BaseRandomForestModel @@ -75,13 +76,21 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": RF_params, int) except + - cdef void predict[T, L](cumlHandle& handle, - RandomForestMetaData[T, L] *, - T*, - int, - int, - L*, - int) except + + cdef void predict(cumlHandle& handle, + RandomForestMetaData[float, int] *, + float*, + int, + int, + int*, + bool) except + + + cdef void predict(cumlHandle& handle, + RandomForestMetaData[double, int]*, + double*, + int, + int, + int*, + bool) except + cdef void predictGetAll(cumlHandle& handle, RandomForestMetaData[float, int] *, @@ -299,6 +308,62 @@ class RandomForestClassifier(BaseRandomForestModel): free( self.rf_forest64) + """ + def _obtain_treelite_handle(self): + cdef ModelHandle cuml_model_ptr = NULL + cdef RandomForestMetaData[float, int] *rf_forest = \ + self.rf_forest + if self.num_classes > 2: + raise NotImplementedError("Pickling for multi-class " + "classification models is currently not " + "implemented. Please check cuml issue " + "#1679 for more information.") + cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes + cdef vector[unsigned char] model_pbuf_vec + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + if self.treelite_handle is None: + build_treelite_forest( + & cuml_model_ptr, + rf_forest, + self.n_cols, + self.num_classes, + model_pbuf_vec) + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + return self.treelite_handle + + def _get_protobuf_bytes(self): + + Returns the self.model_pbuf_bytes. + Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite + models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model + to a treelite protobuf model which is stored in a temporary file. + The protobuf model information is read from the temporary file and + the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite + protobuf model bytes are stored in `self.model_pbuf_bytes`. If either + of information is already present in the model then the respective + step is skipped. + + if self.model_pbuf_bytes: + return self.model_pbuf_bytes + elif self.treelite_handle: + fit_mod_ptr = self.treelite_handle + else: + fit_mod_ptr = self._obtain_treelite_handle() + cdef uintptr_t model_ptr = fit_mod_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self.model_pbuf_bytes + """ + def convert_to_treelite_model(self): """ Converts the cuML RF model to a Treelite model @@ -356,7 +421,10 @@ class RandomForestClassifier(BaseRandomForestModel): A Forest Inference model which can be used to perform inferencing on the random forest model. """ + print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%") + print(" self.treelite handle in conv to fil : ", self.treelite_handle) treelite_handle = self._obtain_treelite_handle() + print(" self.treelite_handle in convert_to_fil : ", self.treelite_handle) return _obtain_fil_model(treelite_handle=treelite_handle, depth=self.max_depth, output_class=output_class, @@ -367,20 +435,28 @@ class RandomForestClassifier(BaseRandomForestModel): """ TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. + + + def _concatenate_treelite_handle(self, treelite_handle): + cdef ModelHandle concat_model_handle = NULL + cdef vector[ModelHandle] *model_handles \ + = new vector[ModelHandle]() + cdef uintptr_t mod_ptr + for i in treelite_handle: + mod_ptr = i + model_handles.push_back(( + mod_ptr)) + + concat_model_handle = concatenate_trees(deref(model_handles)) + cdef uintptr_t concat_model_ptr = concat_model_handle + self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( concat_model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self """ - def _tl_model_handles(self, model_bytes): - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - task_category = CLASSIFICATION_MODEL - build_treelite_forest(& cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = cuml_model_ptr - - return ctypes.c_void_p(mod_handle).value def fit(self, X, y, convert_dtype=False): """ @@ -472,6 +548,54 @@ class RandomForestClassifier(BaseRandomForestModel): del y_m return self """ + def _predict_model_on_gpu(self, X, output_class, + threshold, algo, + convert_dtype, + fil_sparse_format, predict_proba): + out_type = self._get_output_type(X) + cdef ModelHandle cuml_model_ptr = NULL + _, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='F', + check_cols=self.n_cols) + + if dtype == np.float64 and not convert_dtype: + raise TypeError("GPU based predict only accepts np.float32 data. \ + Please set convert_dtype=True to convert the test \ + data to the same dtype as the data used to train, \ + ie. np.float32. If you would like to use test \ + data of dtype=np.float64 please set \ + predict_model='CPU' to use the CPU implementation \ + of predict.") + + cdef RandomForestMetaData[float, int] *rf_forest = \ + self.rf_forest + + build_treelite_forest(& cuml_model_ptr, + rf_forest, + n_cols, + self.num_classes, + self.model_pbuf_bytes) + mod_ptr = cuml_model_ptr + treelite_handle = ctypes.c_void_p(mod_ptr).value + + storage_type = \ + _check_fil_parameter_validity(depth=self.max_depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_type) + + preds = tl_to_fil_model.predict(X, output_type=out_type, + predict_proba=predict_proba) + tl.free_treelite_model(treelite_handle) + return preds + """ def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr @@ -519,12 +643,11 @@ class RandomForestClassifier(BaseRandomForestModel): # synchronous w/o a stream del(X_m) return preds.to_output(out_type) - """ def predict(self, X, predict_model="GPU", output_class=True, threshold=0.5, algo='auto', - num_classes=2, convert_dtype=True, + convert_dtype=True, fil_sparse_format='auto'): """ Predicts the labels for X. @@ -599,8 +722,7 @@ class RandomForestClassifier(BaseRandomForestModel): else: preds = \ - self._predict_model_on_gpu(model=RandomForestClassifier, - X=X, output_class=output_class, + self._predict_model_on_gpu(X=X, output_class=output_class, threshold=threshold, algo=algo, convert_dtype=convert_dtype, @@ -671,7 +793,7 @@ class RandomForestClassifier(BaseRandomForestModel): def predict_proba(self, X, output_class=True, threshold=0.5, algo='auto', - num_classes=2, convert_dtype=True, + convert_dtype=True, fil_sparse_format='auto'): """ Predicts class probabilites for X. This function uses the GPU @@ -745,8 +867,7 @@ class RandomForestClassifier(BaseRandomForestModel): "implemented. Please check cuml issue " "#1679 for more information.") preds_proba = \ - self._predict_model_on_gpu(model=RandomForestClassifier, - X=X, output_class=output_class, + self._predict_model_on_gpu(X, output_class=output_class, threshold=threshold, algo=algo, convert_dtype=convert_dtype, @@ -821,7 +942,6 @@ class RandomForestClassifier(BaseRandomForestModel): y_ptr = y_m.ptr preds = self.predict(X, output_class=True, threshold=threshold, algo=algo, - num_classes=num_classes, convert_dtype=convert_dtype, predict_model=predict_model, fil_sparse_format=fil_sparse_format) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index a811b32518..2d3a9a4af0 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -21,6 +21,7 @@ import ctypes import cudf +import math import numpy as np import warnings @@ -31,7 +32,6 @@ from libc.stdlib cimport calloc, malloc, free from cuml import ForestInference from cuml.common.array import CumlArray -from cuml.common.base import Base from cuml.common.handle import Handle from cuml.ensemble.randomforest_common import BaseRandomForestModel from cuml.common.handle cimport cumlHandle @@ -68,14 +68,22 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": double*, RF_params, int) except + - - cdef void predict[T](cumlHandle& handle, - RandomForestMetaData[T, T] *, - T*, - int, - int, - T*, - int) except + + + cdef void predict(cumlHandle& handle, + RandomForestMetaData[float, float] *, + float*, + int, + int, + float*, + int) except + + + cdef void predict(cumlHandle& handle, + RandomForestMetaData[double, double]*, + double*, + int, + int, + double*, + int) except + cdef RF_metrics score(cumlHandle& handle, RandomForestMetaData[float, float]*, @@ -281,9 +289,6 @@ class RandomForestRegressor(BaseRandomForestModel): self.rf_forest) free( self.rf_forest64) - self.treelite_handle = None - self.model_pbuf_bytes = bytearray() - def convert_to_treelite_model(self): """ @@ -297,24 +302,80 @@ class RandomForestRegressor(BaseRandomForestModel): return _obtain_treelite_model(handle) + def convert_to_fil_model(self, output_class=False, + algo='auto', + fil_sparse_format='auto'): + """ + Create a Forest Inference (FIL) model from the trained cuML + Random Forest model. + Parameters + ---------- + output_class : boolean (default = True) + This is optional and required only while performing the + predict operation on the GPU. + If true, return a 1 or 0 depending on whether the raw + prediction exceeds the threshold. If False, just return + the raw prediction. + algo : string (default = 'auto') + This is optional and required only while performing the + predict operation on the GPU. + 'naive' - simple inference using shared memory + 'tree_reorg' - similar to naive but trees rearranged to be more + coalescing-friendly + 'batch_tree_reorg' - similar to tree_reorg but predicting + multiple rows per thread block + `auto` - choose the algorithm automatically. Currently + 'batch_tree_reorg' is used for dense storage + and 'naive' for sparse storage + fil_sparse_format : boolean or string (default = auto) + This variable is used to choose the type of forest that will be + created in the Forest Inference Library. It is not required + while using predict_model='CPU'. + 'auto' - choose the storage type automatically + (currently True is chosen by auto) + False - create a dense forest + True - create a sparse forest, requires algo='naive' + or algo='auto' + Returns + ---------- + fil_model : + A Forest Inference model which can be used to perform + inferencing on the random forest model. + """ + treelite_handle = self._obtain_treelite_handle() + return _obtain_fil_model(treelite_handle=treelite_handle, + depth=self.max_depth, + output_class=output_class, + algo=algo, + fil_sparse_format=fil_sparse_format) + """ TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ - def _tl_model_handles(self, model_bytes): - task_category = REGRESSION_MODEL - cdef ModelHandle tl_model_ptr = NULL - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - build_treelite_forest(& tl_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = tl_model_ptr - - return ctypes.c_void_p(mod_handle).value + """ + def _concatenate_treelite_handle(self, treelite_handle): + cdef ModelHandle concat_model_handle = NULL + cdef vector[ModelHandle] *model_handles \ + = new vector[ModelHandle]() + cdef uintptr_t mod_ptr + for i in treelite_handle: + mod_ptr = i + model_handles.push_back(( + mod_ptr)) + + concat_model_handle = concatenate_trees(deref(model_handles)) + + cdef uintptr_t concat_model_ptr = concat_model_handle + self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( concat_model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self + """ def fit(self, X, y, convert_dtype=False): """ @@ -393,6 +454,53 @@ class RandomForestRegressor(BaseRandomForestModel): del X_m del y_m return self + + """ + def _predict_model_on_gpu(self, X, algo, convert_dtype, + fil_sparse_format): + out_type = self._get_output_type(X) + cdef ModelHandle cuml_model_ptr = NULL + _, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='F', + check_cols=self.n_cols) + + if dtype == np.float64 and not convert_dtype: + raise TypeError("GPU based predict only accepts np.float32 data. \ + Please set convert_dtype=True to convert the test \ + data to the same dtype as the data used to train, \ + ie. np.float32. If you would like to use test \ + data of dtype=np.float64 please set \ + predict_model='CPU' to use the CPU implementation \ + of predict.") + + cdef RandomForestMetaData[float, float] *rf_forest = \ + self.rf_forest + + task_category = REGRESSION_MODEL + build_treelite_forest(& cuml_model_ptr, + rf_forest, + n_cols, + task_category, + self.model_pbuf_bytes) + mod_ptr = cuml_model_ptr + treelite_handle = ctypes.c_void_p(mod_ptr).value + + storage_type = \ + _check_fil_parameter_validity(depth=self.max_depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_from_randomforest(treelite_handle, + output_class=False, + algo=algo, + storage_type=storage_type) + + preds = tl_to_fil_model.predict(X, out_type) + tl.free_treelite_model(treelite_handle) + return preds + """ def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) @@ -404,7 +512,7 @@ class RandomForestRegressor(BaseRandomForestModel): check_cols=self.n_cols) X_ptr = X_m.ptr - preds = CumlArray.zeros(n_rows, dtype=np.int32) + preds = CumlArray.zeros(n_rows, dtype=dtype) cdef uintptr_t preds_ptr = preds.ptr cdef cumlHandle* handle_ =\ @@ -417,23 +525,23 @@ class RandomForestRegressor(BaseRandomForestModel): self.rf_forest64 if self.dtype == np.float32: predict(handle_[0], - rf_forest, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) + rf_forest, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) elif self.dtype == np.float64: predict(handle_[0], - rf_forest64, - X_ptr, - n_rows, - n_cols, - preds_ptr, - self.verbosity) + rf_forest64, + X_ptr, + n_rows, + n_cols, + preds_ptr, + self.verbosity) else: - raise TypeError("supports only np.float32 and np.float64 input," + raise TypeError("supports only float32 and float64 input," " but input of type '%s' passed." % (str(self.dtype))) @@ -501,8 +609,7 @@ class RandomForestRegressor(BaseRandomForestModel): setting predict_model = 'CPU'") else: - preds = self._predict_model_on_gpu(model=RandomForestRegressor, - X=X, + preds = self._predict_model_on_gpu(X=X, algo=algo, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format) From 89512dae37f00ce2c52a181b49e90477fbcac012 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 06:54:34 -0500 Subject: [PATCH 07/32] updated docs and code: --- .../dask/ensemble/randomforestclassifier.py | 1 - python/cuml/ensemble/randomforest_common.pyx | 431 ++++++++++++++++++ .../cuml/ensemble/randomforestclassifier.pyx | 292 +----------- .../cuml/ensemble/randomforestregressor.pyx | 263 +---------- 4 files changed, 464 insertions(+), 523 deletions(-) create mode 100644 python/cuml/ensemble/randomforest_common.pyx diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index d74592e484..27973f219e 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -298,7 +298,6 @@ def predict(self, X, output_class=True, algo='auto', threshold=0.5, self.predict_using_fil(X, output_class=output_class, algo=algo, threshold=threshold, - num_classes=self.num_classes, convert_dtype=convert_dtype, predict_model="GPU", fil_sparse_format=fil_sparse_format, diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx new file mode 100644 index 0000000000..0801370f9e --- /dev/null +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -0,0 +1,431 @@ +# +# Copyright (c) 2020, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import ctypes +import cupy as cp +import math +import warnings + +import numpy as np +from cuml import ForestInference +from cuml.fil.fil import TreeliteModel +from cuml.common.handle import Handle +from cuml.common.base import Base +from cuml.common.array import CumlArray + +from cython.operator cimport dereference as deref + +from cuml.ensemble.randomforest_shared cimport * +from cuml.common import input_to_cuml_array, rmm_cupy_ary + +cimport cython + +# create a cdef class and cdef func which will call the C++ cdef func and then return the required handle and stuff +class BaseRandomForestModel(Base): + variables = ['n_estimators', 'max_depth', 'handle', + 'max_features', 'n_bins', + 'split_algo', 'split_criterion', 'min_rows_per_node', + 'min_impurity_decrease', + 'bootstrap', 'bootstrap_features', + 'verbose', 'rows_sample', + 'max_leaves', 'quantile_per_tree'] + + def _create_model(self, model, seed, split_criterion, + n_streams, n_estimators=100, + max_depth=16, handle=None, max_features='auto', + n_bins=8, split_algo=1, bootstrap=True, + bootstrap_features=False, + verbose=False, min_rows_per_node=2, + rows_sample=1.0, max_leaves=-1, + accuracy_metric=None, dtype=None, + output_type=None, min_samples_leaf=None, + min_weight_fraction_leaf=None, n_jobs=None, + max_leaf_nodes=None, min_impurity_decrease=0.0, + min_impurity_split=None, oob_score=None, + random_state=None, warm_start=None, class_weight=None, + quantile_per_tree=False, criterion=None): + + if accuracy_metric: + model.variables.append('accuracy_metric') + sklearn_params = {"criterion": criterion, + "min_samples_leaf": min_samples_leaf, + "min_weight_fraction_leaf": min_weight_fraction_leaf, + "max_leaf_nodes": max_leaf_nodes, + "min_impurity_split": min_impurity_split, + "oob_score": oob_score, "n_jobs": n_jobs, + "random_state": random_state, + "warm_start": warm_start, + "class_weight": class_weight} + + for key, vals in sklearn_params.items(): + if vals is not None: + raise TypeError(" The Scikit-learn variable ", key, + " is not supported in cuML," + " please read the cuML documentation for" + " more information") + + if handle is None: + handle = Handle(n_streams) + + super(model, self).__init__(handle=handle, + verbose=verbose, + output_type=output_type) + if max_depth < 0: + raise ValueError("Must specify max_depth >0 ") + + self.split_algo = split_algo + criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, + '3': MAE, '4': CRITERION_END} + if str(split_criterion) not in criterion_dict.keys(): + warnings.warn("The split criterion chosen was not present" + " in the list of options accepted by the model" + " and so the CRITERION_END option has been chosen.") + self.split_criterion = CRITERION_END + else: + self.split_criterion = criterion_dict[str(split_criterion)] + + self.min_rows_per_node = min_rows_per_node + self.min_impurity_decrease = min_impurity_decrease + self.bootstrap_features = bootstrap_features + self.rows_sample = rows_sample + self.max_leaves = max_leaves + self.n_estimators = n_estimators + self.max_depth = max_depth + self.max_features = max_features + self.bootstrap = bootstrap + self.verbose = verbose + self.n_bins = n_bins + self.n_cols = None + self.dtype = dtype + self.accuracy_metric = accuracy_metric + self.quantile_per_tree = quantile_per_tree + self.n_streams = handle.getNumInternalStreams() + self.seed = seed + self.rf_forest = 0 + self.rf_forest64 = 0 + self.model_pbuf_bytes = bytearray() + self.treelite_handle = None + + def _get_max_feat_val(self): + if type(self.max_features) == int: + return self.max_features/self.n_cols + elif type(self.max_features) == float: + return self.max_features + elif self.max_features == 'sqrt': + return 1/np.sqrt(self.n_cols) + elif self.max_features == 'log2': + return math.log2(self.n_cols)/self.n_cols + elif self.max_features == 'auto': + if self.RF_type == CLASSIFICATION: + return 1/np.sqrt(self.n_cols) + else: + return 1.0 + else: + raise ValueError("Wrong value passed in for max_features" + " please read the documentation") + + def _get_protobuf_bytes(self): + """ + Returns the self.model_pbuf_bytes. + Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite + models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model + to a treelite protobuf model which is stored in a temporary file. + The protobuf model information is read from the temporary file and + the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite + protobuf model bytes are stored in `self.model_pbuf_bytes`. If either + of information is already present in the model then the respective + step is skipped. + """ + if self.model_pbuf_bytes: + return self.model_pbuf_bytes + elif self.treelite_handle: + fit_mod_ptr = self.treelite_handle + else: + fit_mod_ptr = self._obtain_treelite_handle() + cdef uintptr_t model_ptr = fit_mod_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + return self.model_pbuf_bytes + + + def _obtain_treelite_handle(self): + if self.treelite_handle: + print(" treelite handle in obt : ", self.treelite_handle) + return self.treelite_handle # Use cached version + cdef ModelHandle cuml_model_ptr = NULL + cdef unsigned char[::1] model_pbuf_mv + cdef vector[unsigned char] model_pbuf_vec + if self.model_pbuf_bytes: + model_pbuf_mv = self.model_pbuf_bytes + with cython.boundscheck(False): + model_pbuf_vec.assign(& model_pbuf_mv[0], + & model_pbuf_mv[model_pbuf_mv.shape[0]]) + else: + model_pbuf_vec = bytearray() + if self.RF_type == CLASSIFICATION: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + self.num_classes, + model_pbuf_vec) + else: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + REGRESSION_MODEL, + model_pbuf_vec) + + mod_ptr = cuml_model_ptr + self.treelite_handle = ctypes.c_void_p(mod_ptr).value + return self.treelite_handle + + + def _dataset_setup(self, X, y, convert_dtype): + self._set_output_type(X) + + # Reset the old tree data for new fit call + self._reset_forest_data() + + X_m, self.n_rows, self.n_cols, self.dtype = \ + input_to_cuml_array(X, check_dtype=[np.float32, np.float64], + order='F') + if self.n_bins > self.n_rows: + raise ValueError("The number of bins,`n_bins` can not be greater" + " than the number of samples used for training.") + if self.RF_type == CLASSIFICATION: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, check_dtype=np.int32, + convert_to_dtype=(np.int32 if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + if y_dtype != np.int32: + raise TypeError("The labels `y` need to be of dtype `np.int32`") + unique_labels = rmm_cupy_ary(cp.unique, y_m) + self.num_classes = len(unique_labels) + for i in range(self.num_classes): + if i not in unique_labels: + raise ValueError("The labels need " + "to be consecutive values from " + "0 to the number of unique label values") + else: + y_m, _, _, y_dtype = \ + input_to_cuml_array(y, + convert_to_dtype=(self.dtype if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) + + if self.dtype == np.float64: + warnings.warn("To use GPU-based prediction, first train using \ + float 32 data to fit the estimator.") + + max_feature_val = self._get_max_feat_val() + if type(self.min_rows_per_node) == float: + self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows) + del X + del y + return X_m, y_m, max_feature_val + + + def _tl_model_handles(self, model_bytes): + cdef ModelHandle cuml_model_ptr = NULL + if self.RF_type == CLASSIFICATION: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + self.num_classes, + model_bytes) + else: + build_treelite_forest( + & cuml_model_ptr, + self.rf_forest, + self.n_cols, + REGRESSION_MODEL, + model_bytes) + mod_handle = cuml_model_ptr + + return ctypes.c_void_p(mod_handle).value + + + def _concatenate_treelite_handle(self, treelite_handle): + cdef ModelHandle concat_model_handle = NULL + cdef vector[ModelHandle] *model_handles \ + = new vector[ModelHandle]() + cdef uintptr_t mod_ptr + for i in treelite_handle: + mod_ptr = i + model_handles.push_back(( + mod_ptr)) + + self._reset_forest_data() + concat_model_handle = concatenate_trees(deref(model_handles)) + cdef uintptr_t concat_model_ptr = concat_model_handle + self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ + save_model( concat_model_ptr) + cdef unsigned char[::1] pbuf_mod_view = \ + pbuf_mod_info.data() + self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) + + # Fix up some instance variables that should match the new TL model + tl_model = TreeliteModel.from_treelite_model_handle( + self.treelite_handle, + take_handle_ownership=False) + self.n_cols = tl_model.num_features + self.n_estimators = tl_model.num_trees + + return self + + + def _predict_model_on_gpu(self, X, algo, convert_dtype, + fil_sparse_format, threshold=0.5, + output_class=False, predict_proba=False): + out_type = self._get_output_type(X) + cdef ModelHandle cuml_model_ptr = NULL + _, n_rows, n_cols, dtype = \ + input_to_cuml_array(X, order='F', + check_cols=self.n_cols) + + if dtype == np.float64 and not convert_dtype: + raise TypeError("GPU based predict only accepts np.float32 data. \ + Please set convert_dtype=True to convert the test \ + data to the same dtype as the data used to train, \ + ie. np.float32. If you would like to use test \ + data of dtype=np.float64 please set \ + predict_model='CPU' to use the CPU implementation \ + of predict.") + + treelite_handle = self._obtain_treelite_handle() + + storage_type = \ + _check_fil_parameter_validity(depth=self.max_depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_using_treelite_handle(treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_type) + + preds = tl_to_fil_model.predict(X, output_type=out_type, + predict_proba=predict_proba) + return preds + + def _get_params(self, model, deep): + params = dict() + for key in model.variables: + if key in ['handle']: + continue + var_value = getattr(self, key, None) + params[key] = var_value + return params + + def _set_params(self, model, **params): + self.handle.__setstate__(self.n_streams) + self.model_pbuf_bytes = [] + + if not params: + return self + for key, value in params.items(): + if key not in model.variables: + raise ValueError('Invalid parameter for estimator') + else: + setattr(self, key, value) + return self + + +def _check_fil_parameter_validity(depth, algo, fil_sparse_format): + storage_format = _check_fil_sparse_format_value(fil_sparse_format) + if (depth > 16 and (storage_format == 'dense' or + algo == 'tree_reorg' or + algo == 'batch_tree_reorg')): + raise ValueError("While creating a forest with max_depth greater " + "than 16, `fil_sparse_format` should be True. " + "If `fil_sparse_format=False` then the memory" + "consumed while creating the FIL forest is very " + "large and the process will be aborted. In " + "addition, `algo` must be either set to `naive' " + "or `auto` to set 'fil_sparse_format=True`.") + return storage_format + + +def _check_fil_sparse_format_value(fil_sparse_format): + accepted_vals = [True, False, 'auto'] + if fil_sparse_format == 'auto': + storage_format = fil_sparse_format + elif not fil_sparse_format: + storage_format = 'dense' + elif fil_sparse_format not in accepted_vals: + raise ValueError("The value entered for spares_forest is not " + "supported. Please refer to the documentation " + "to see the accepted values.") + else: + storage_format = 'sparse' + return storage_format + + +def _obtain_treelite_model(treelite_handle): + """ + Creates a Treelite model using the treelite handle + obtained from the cuML Random Forest model. + + Returns + ---------- + tl_to_fil_model : Treelite version of this model + """ + treelite_model = \ + TreeliteModel.from_treelite_model_handle(treelite_handle) + return treelite_model + + +def _obtain_fil_model(treelite_handle, depth, + output_class=True, + threshold=0.5, algo='auto', + fil_sparse_format='auto'): + """ + Creates a Forest Inference (FIL) model using the treelite + handle obtained from the cuML Random Forest model. + + Returns + ---------- + fil_model : + A Forest Inference model which can be used to perform + inferencing on the random forest model. + """ + storage_format = \ + _check_fil_parameter_validity(depth=depth, + fil_sparse_format=fil_sparse_format, + algo=algo) + + fil_model = ForestInference() + tl_to_fil_model = \ + fil_model.load_using_treelite_handle(treelite_handle, + output_class=output_class, + threshold=threshold, + algo=algo, + storage_type=storage_format) + + return tl_to_fil_model diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index cc3a915edc..ed23fc4ce4 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -41,6 +41,8 @@ from cuml.common.handle import Handle from cuml.common.handle cimport cumlHandle from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model +from cuml.ensemble.randomforest_common import BaseRandomForestModel + from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel from cuml.common import input_to_cuml_array, rmm_cupy_ary @@ -123,7 +125,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": bool) except + -class RandomForestClassifier(Base): +class RandomForestClassifier(BaseRandomForestModel): """ Implements a Random Forest classifier model which fits multiple decision tree classifiers in an ensemble. @@ -231,81 +233,20 @@ class RandomForestClassifier(Base): 'rows_sample', 'max_leaves', 'quantile_per_tree'] - def __init__(self, n_estimators=100, max_depth=16, handle=None, - max_features='auto', n_bins=8, n_streams=8, - split_algo=1, split_criterion=0, min_rows_per_node=2, - bootstrap=True, bootstrap_features=False, - type_model="classifier", verbose=False, - rows_sample=1.0, max_leaves=-1, quantile_per_tree=False, - output_type=None, criterion=None, dtype=None, - min_samples_leaf=None, min_weight_fraction_leaf=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, n_jobs=None, - random_state=None, warm_start=None, class_weight=None, - seed=None): - sklearn_params = {"criterion": criterion, - "min_samples_leaf": min_samples_leaf, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "max_leaf_nodes": max_leaf_nodes, - "min_impurity_split": min_impurity_split, - "oob_score": oob_score, "n_jobs": n_jobs, - "random_state": random_state, - "warm_start": warm_start, - "class_weight": class_weight} - - for key, vals in sklearn_params.items(): - if vals is not None: - raise TypeError("The Scikit-learn variable", key, - " is not supported in cuML," - " please read the cuML documentation for" - " more information") - - if max_depth < 0: - raise ValueError("Must specify max_depth >0") - - if handle is None: - handle = Handle(n_streams) - - super(RandomForestClassifier, self).__init__(handle=handle, - verbose=verbose, - output_type=output_type) - - self.split_algo = split_algo - criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, - '3': MAE, '4': CRITERION_END} - if str(split_criterion) not in criterion_dict.keys(): - warnings.warn("The split criterion chosen was not present" - " in the list of options accepted by the model" - " and so the CRITERION_END option has been chosen.") - self.split_criterion = CRITERION_END - else: - self.split_criterion = criterion_dict[str(split_criterion)] - - self.min_rows_per_node = min_rows_per_node - self.min_impurity_decrease = min_impurity_decrease - self.bootstrap_features = bootstrap_features - self.rows_sample = rows_sample - self.max_leaves = max_leaves - self.n_estimators = n_estimators - self.max_depth = max_depth - self.max_features = max_features - self.bootstrap = bootstrap - self.treelite_handle = None - self.n_bins = n_bins - self.quantile_per_tree = quantile_per_tree - self.n_cols = None - self.dtype = None - self.n_streams = handle.getNumInternalStreams() - self.seed = seed - self.num_classes = 2 + def __init__(self, split_criterion=0, seed=None, + n_streams=8, **kwargs): if ((seed is not None) and (n_streams != 1)): warnings.warn("For reproducible results, n_streams==1 is " "recommended. If n_streams is > 1, results may vary " "due to stream/thread timing differences, even when " "random_seed is set") - self.rf_forest = 0 - self.rf_forest64 = 0 - self.model_pbuf_bytes = bytearray() + + self.RF_type = CLASSIFICATION + self.num_classes = 2 + self._create_model(model=RandomForestClassifier, + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + **kwargs) """ TODO: @@ -385,86 +326,6 @@ class RandomForestClassifier(Base): self.model_pbuf_bytes = bytearray() self.n_cols = None - def _get_max_feat_val(self): - if type(self.max_features) == int: - return self.max_features/self.n_cols - elif type(self.max_features) == float: - return self.max_features - elif self.max_features == 'sqrt' or self.max_features == 'auto': - return 1/np.sqrt(self.n_cols) - elif self.max_features == 'log2': - return math.log2(self.n_cols)/self.n_cols - else: - raise ValueError("Wrong value passed in for max_features" - " please read the documentation") - - def _obtain_treelite_handle(self): - """Returns a handle to a treelite-formatted version of the model. - This will create a new treelite model if necessary, or return - a cached version when available. The handle is cached in the - instanced and freed at instance deletion. Caller should not - delete the returned model.""" - if self.treelite_handle is not None: - return self.treelite_handle # Cached version - - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - - assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \ - "Attempting to create treelite from un-fit forest." - - if self.num_classes > 2: - raise NotImplementedError("Pickling for multi-class " - "classification models is currently not " - "implemented. Please check cuml issue " - "#1679 for more information.") - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - task_category = CLASSIFICATION_MODEL - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - return self.treelite_handle - - def _get_protobuf_bytes(self): - """ - Returns the self.model_pbuf_bytes. - Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite - models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. - """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes - elif self.treelite_handle: - fit_mod_ptr = self.treelite_handle - else: - fit_mod_ptr = self._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes - def convert_to_treelite_model(self): """ Converts the cuML RF model to a Treelite model @@ -534,49 +395,6 @@ class RandomForestClassifier(Base): TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ - def _tl_model_handles(self, model_bytes): - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, int] *rf_forest = \ - self.rf_forest - task_category = CLASSIFICATION_MODEL - build_treelite_forest(& cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = cuml_model_ptr - - return ctypes.c_void_p(mod_handle).value - - def _concatenate_treelite_handle(self, treelite_handle): - cdef ModelHandle concat_model_handle = NULL - cdef vector[ModelHandle] *model_handles \ - = new vector[ModelHandle]() - cdef uintptr_t mod_ptr - for i in treelite_handle: - mod_ptr = i - model_handles.push_back(( - mod_ptr)) - - self._reset_forest_data() - concat_model_handle = concatenate_trees(deref(model_handles)) - cdef uintptr_t concat_model_ptr = concat_model_handle - self.treelite_handle = concat_model_ptr - - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - - # Fix up some instance variables that should match the new TL model - tl_model = TreeliteModel.from_treelite_model_handle( - self.treelite_handle, - take_handle_ownership=False) - self.n_cols = tl_model.num_features - self.n_estimators = tl_model.num_trees - - return self def fit(self, X, y, convert_dtype=False): """ @@ -599,50 +417,16 @@ class RandomForestClassifier(Base): memory used for the method. """ - self._set_output_type(X) - - # Reset the old tree data for new fit call - self._reset_forest_data() - + X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) cdef uintptr_t X_ptr, y_ptr - - X_m, n_rows, self.n_cols, self.dtype = \ - input_to_cuml_array(X, check_dtype=[np.float32, np.float64], - order='F') - if self.n_bins > n_rows: - raise ValueError("The number of bins,`n_bins` can not be greater" - " than the number of samples used for training.") X_ptr = X_m.ptr - - y_m, _, _, y_dtype = \ - input_to_cuml_array(y, check_dtype=np.int32, - convert_to_dtype=(np.int32 if convert_dtype - else None), - check_rows=n_rows, check_cols=1) y_ptr = y_m.ptr - if y_dtype != np.int32: - raise TypeError("The labels `y` need to be of dtype `np.int32`") - - if self.dtype == np.float64: - warnings.warn("To use GPU-based prediction, first train \ - using float 32 data to fit the estimator.") - cdef cumlHandle* handle_ =\ self.handle.getHandle() unique_labels = rmm_cupy_ary(cp.unique, y_m) num_unique_labels = len(unique_labels) - for i in range(num_unique_labels): - if i not in unique_labels: - raise ValueError("The labels need " - "to be consecutive values from " - "0 to the number of unique label values") - - max_feature_val = self._get_max_feat_val() - if type(self.min_rows_per_node) == float: - self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows) - cdef RandomForestMetaData[float, int] *rf_forest = \ new RandomForestMetaData[float, int]() self.rf_forest = rf_forest @@ -675,7 +459,7 @@ class RandomForestClassifier(Base): fit(handle_[0], rf_forest, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, num_unique_labels, @@ -687,7 +471,7 @@ class RandomForestClassifier(Base): fit(handle_[0], rf_forest64, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, num_unique_labels, @@ -706,43 +490,6 @@ class RandomForestClassifier(Base): self.num_classes = num_unique_labels return self - def _predict_model_on_gpu(self, X, output_class, - threshold, algo, - num_classes, convert_dtype, - fil_sparse_format, predict_proba): - out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL - _, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='F', - check_cols=self.n_cols) - - if dtype == np.float64 and not convert_dtype: - raise TypeError("GPU based predict only accepts np.float32 data. \ - Please set convert_dtype=True to convert the test \ - data to the same dtype as the data used to train, \ - ie. np.float32. If you would like to use test \ - data of dtype=np.float64 please set \ - predict_model='CPU' to use the CPU implementation \ - of predict.") - - treelite_handle = self._obtain_treelite_handle() - - storage_type = \ - _check_fil_parameter_validity(depth=self.max_depth, - fil_sparse_format=fil_sparse_format, - algo=algo) - fil_model = ForestInference() - tl_to_fil_model = \ - fil_model.load_using_treelite_handle(treelite_handle, - output_class=output_class, - threshold=threshold, - algo=algo, - storage_type=storage_type) - - preds = tl_to_fil_model.predict(X, output_type=out_type, - predict_proba=predict_proba) - return preds - def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr @@ -794,7 +541,7 @@ class RandomForestClassifier(Base): def predict(self, X, predict_model="GPU", output_class=True, threshold=0.5, algo='auto', - num_classes=2, convert_dtype=True, + convert_dtype=True, fil_sparse_format='auto'): """ Predicts the labels for X. @@ -832,7 +579,8 @@ class RandomForestClassifier(Base): while performing the predict operation on the GPU. It is applied if output_class == True, else it is ignored num_classes : int (default = 2) - number of different classes present in the dataset + number of different classes present in the dataset. This variable + will be depricated in 0.16 convert_dtype : bool, optional (default = True) When set to True, the predict method will, when necessary, convert the input to the data type which was used to train the model. This @@ -872,7 +620,6 @@ class RandomForestClassifier(Base): self._predict_model_on_gpu(X, output_class=output_class, threshold=threshold, algo=algo, - num_classes=num_classes, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format, predict_proba=False) @@ -941,7 +688,7 @@ class RandomForestClassifier(Base): def predict_proba(self, X, output_class=True, threshold=0.5, algo='auto', - num_classes=2, convert_dtype=True, + convert_dtype=True, fil_sparse_format='auto'): """ Predicts class probabilites for X. This function uses the GPU @@ -1018,7 +765,6 @@ class RandomForestClassifier(Base): self._predict_model_on_gpu(X, output_class=output_class, threshold=threshold, algo=algo, - num_classes=num_classes, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format, predict_proba=True) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index dc0a4f85f8..f684f69ccc 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -37,6 +37,7 @@ from cuml.common.handle import Handle from cuml.common.handle cimport cumlHandle from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model +from cuml.ensemble.randomforest_common import BaseRandomForestModel from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel @@ -103,7 +104,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": int) except + -class RandomForestRegressor(Base): +class RandomForestRegressor(BaseRandomForestModel): """ Implements a Random Forest regressor model which fits multiple decision @@ -218,81 +219,15 @@ class RandomForestRegressor(Base): 'max_leaves', 'quantile_per_tree', 'accuracy_metric'] - def __init__(self, n_estimators=100, max_depth=16, handle=None, - max_features='auto', n_bins=8, n_streams=8, - split_algo=1, split_criterion=2, - bootstrap=True, bootstrap_features=False, - verbose=False, min_rows_per_node=2, - rows_sample=1.0, max_leaves=-1, - accuracy_metric='mse', output_type=None, - min_samples_leaf=None, dtype=None, - min_weight_fraction_leaf=None, n_jobs=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, - random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None, seed=None): - sklearn_params = {"criterion": criterion, - "min_samples_leaf": min_samples_leaf, - "min_weight_fraction_leaf": min_weight_fraction_leaf, - "max_leaf_nodes": max_leaf_nodes, - "min_impurity_split": min_impurity_split, - "oob_score": oob_score, "n_jobs": n_jobs, - "random_state": random_state, - "warm_start": warm_start, - "class_weight": class_weight} - - for key, vals in sklearn_params.items(): - if vals is not None: - raise TypeError(" The Scikit-learn variable ", key, - " is not supported in cuML," - " please read the cuML documentation for" - " more information") - - if handle is None: - handle = Handle(n_streams) - - super(RandomForestRegressor, self).__init__(handle=handle, - verbose=verbose, - output_type=output_type) - - if max_depth < 0: - raise ValueError("Must specify max_depth >0 ") - - self.split_algo = split_algo - criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, - '3': MAE, '4': CRITERION_END} - if str(split_criterion) not in criterion_dict.keys(): - warnings.warn("The split criterion chosen was not present" - " in the list of options accepted by the model" - " and so the CRITERION_END option has been chosen.") - self.split_criterion = CRITERION_END - else: - self.split_criterion = criterion_dict[str(split_criterion)] - - self.min_rows_per_node = min_rows_per_node - self.min_impurity_decrease = min_impurity_decrease - self.bootstrap_features = bootstrap_features - self.rows_sample = rows_sample - self.max_leaves = max_leaves - self.n_estimators = n_estimators - self.max_depth = max_depth - self.max_features = max_features - self.bootstrap = bootstrap - self.n_bins = n_bins - self.n_cols = None - self.dtype = None - self.treelite_handle = None - self.accuracy_metric = accuracy_metric - self.quantile_per_tree = quantile_per_tree - self.n_streams = handle.getNumInternalStreams() - self.seed = seed - if ((seed is not None) and (n_streams != 1)): - warnings.warn("Setting the random seed does not fully guarantee" - " the exact same results at this time.") - self.rf_forest = None - self.rf_forest64 = None - self.model_pbuf_bytes = bytearray() - + def __init__(self, split_criterion=2, seed=None, + accuracy_metric='mse', n_streams=8, + **kwargs): + self.RF_type = REGRESSION + self._create_model(model=RandomForestRegressor, + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + accuracy_metric=accuracy_metric, + **kwargs) """ TODO: Add the preprocess and postprocess functions @@ -369,82 +304,6 @@ class RandomForestRegressor(Base): self.model_pbuf_bytes = bytearray() self.n_cols = None - def _get_max_feat_val(self): - if type(self.max_features) == int: - return self.max_features/self.n_cols - elif type(self.max_features) == float: - return self.max_features - elif self.max_features == 'sqrt': - return 1/np.sqrt(self.n_cols) - elif self.max_features == 'auto': - return 1.0 - elif self.max_features == 'log2': - return math.log2(self.n_cols)/self.n_cols - else: - raise ValueError("Wrong value passed in for max_features" - " please read the documentation") - - def _obtain_treelite_handle(self): - """Returns a handle to a treelite-formatted version of the model. - This will create a new treelite model if necessary, or return - a cached version when available. The handle is cached in the - instanced and freed at instance deletion. Caller should not - delete the returned model.""" - if self.treelite_handle is not None: - return self.treelite_handle # Cached version - - cdef ModelHandle cuml_model_ptr = NULL - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - assert len(self.model_pbuf_bytes) > 0 or self.rf_forest, \ - "Attempting to create treelite from un-fit forest." - - cdef unsigned char[::1] model_pbuf_mv = self.model_pbuf_bytes - cdef vector[unsigned char] model_pbuf_vec - with cython.boundscheck(False): - model_pbuf_vec.assign(& model_pbuf_mv[0], - & model_pbuf_mv[model_pbuf_mv.shape[0]]) - - task_category = REGRESSION_MODEL - build_treelite_forest( - & cuml_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_pbuf_vec) - mod_ptr = cuml_model_ptr - self.treelite_handle = ctypes.c_void_p(mod_ptr).value - return self.treelite_handle - - def _get_protobuf_bytes(self): - """ - Returns the self.model_pbuf_bytes. - Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite - models handle (pointer) is returned - 2. The treelite model handle is used to convert the treelite model - to a treelite protobuf model which is stored in a temporary file. - The protobuf model information is read from the temporary file and - the byte information is returned. - The treelite handle is stored `self.treelite_handle` and the treelite - protobuf model bytes are stored in `self.model_pbuf_bytes`. If either - of information is already present in the model then the respective - step is skipped. - """ - if self.model_pbuf_bytes: - return self.model_pbuf_bytes - elif self.treelite_handle: - fit_mod_ptr = self.treelite_handle - else: - fit_mod_ptr = self._obtain_treelite_handle() - cdef uintptr_t model_ptr = fit_mod_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - return self.model_pbuf_bytes - def convert_to_treelite_model(self): """ Converts the cuML RF model to a Treelite model @@ -511,48 +370,6 @@ class RandomForestRegressor(Base): TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. """ - def _tl_model_handles(self, model_bytes): - task_category = REGRESSION_MODEL - cdef ModelHandle tl_model_ptr = NULL - cdef RandomForestMetaData[float, float] *rf_forest = \ - self.rf_forest - build_treelite_forest(& tl_model_ptr, - rf_forest, - self.n_cols, - task_category, - model_bytes) - mod_handle = tl_model_ptr - - return ctypes.c_void_p(mod_handle).value - - def _concatenate_treelite_handle(self, treelite_handle): - cdef ModelHandle concat_model_handle = NULL - cdef vector[ModelHandle] *model_handles \ - = new vector[ModelHandle]() - cdef uintptr_t mod_ptr - for i in treelite_handle: - mod_ptr = i - model_handles.push_back(( - mod_ptr)) - - self._reset_forest_data() - concat_model_handle = concatenate_trees(deref(model_handles)) - cdef uintptr_t concat_model_ptr = concat_model_handle - self.treelite_handle = concat_model_ptr - cdef vector[unsigned char] pbuf_mod_info = \ - save_model( concat_model_ptr) - cdef unsigned char[::1] pbuf_mod_view = \ - pbuf_mod_info.data() - self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) - - # Fix up some instance variables that should match the new TL model - tl_model = TreeliteModel.from_treelite_model_handle( - self.treelite_handle, - take_handle_ownership=False) - self.n_cols = tl_model.num_features - self.n_estimators = tl_model.num_trees - - return self def fit(self, X, y, convert_dtype=False): """ @@ -570,25 +387,11 @@ class RandomForestRegressor(Base): ndarray, cuda array interface compliant array like CuPy These labels should be contiguous integers from 0 to n_classes. """ - self._set_output_type(X) + X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) # Reset the old tree data for new fit call - self._reset_forest_data() - cdef uintptr_t X_ptr, y_ptr - - X_m, n_rows, self.n_cols, self.dtype = \ - input_to_cuml_array(X, check_dtype=[np.float32, np.float64], - order='F') - if self.n_bins > n_rows: - raise ValueError("The number of bins,`n_bins` can not be greater" - " than the number of samples used for training.") X_ptr = X_m.ptr - y_m, _, _, y_dtype = \ - input_to_cuml_array(y, - convert_to_dtype=(self.dtype if convert_dtype - else None), - check_rows=n_rows, check_cols=1) y_ptr = y_m.ptr if self.dtype == np.float64: @@ -598,10 +401,6 @@ class RandomForestRegressor(Base): cdef cumlHandle* handle_ =\ self.handle.getHandle() - max_feature_val = self._get_max_feat_val() - if type(self.min_rows_per_node) == float: - self.min_rows_per_node = math.ceil(self.min_rows_per_node*n_rows) - cdef RandomForestMetaData[float, float] *rf_forest = \ new RandomForestMetaData[float, float]() self.rf_forest = rf_forest @@ -634,7 +433,7 @@ class RandomForestRegressor(Base): fit(handle_[0], rf_forest, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, rf_params, @@ -645,7 +444,7 @@ class RandomForestRegressor(Base): fit(handle_[0], rf_forest64, X_ptr, - n_rows, + self.n_rows, self.n_cols, y_ptr, rf_params64, @@ -657,40 +456,6 @@ class RandomForestRegressor(Base): del(y_m) return self - def _predict_model_on_gpu(self, X, algo, convert_dtype, - fil_sparse_format): - out_type = self._get_output_type(X) - cdef ModelHandle cuml_model_ptr = NULL - _, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='F', - check_cols=self.n_cols) - - if dtype == np.float64 and not convert_dtype: - raise TypeError("GPU based predict only accepts np.float32 data. \ - Please set convert_dtype=True to convert the test \ - data to the same dtype as the data used to train, \ - ie. np.float32. If you would like to use test \ - data of dtype=np.float64 please set \ - predict_model='CPU' to use the CPU implementation \ - of predict.") - - treelite_handle = self._obtain_treelite_handle() - - storage_type = \ - _check_fil_parameter_validity(depth=self.max_depth, - fil_sparse_format=fil_sparse_format, - algo=algo) - - fil_model = ForestInference() - tl_to_fil_model = \ - fil_model.load_using_treelite_handle(treelite_handle, - output_class=False, - algo=algo, - storage_type=storage_type) - - preds = tl_to_fil_model.predict(X, out_type) - return preds - def _predict_model_on_cpu(self, X, convert_dtype): out_type = self._get_output_type(X) cdef uintptr_t X_ptr From cff16d74ab566548d4695615ad7b0c13fd722104 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 07:31:56 -0500 Subject: [PATCH 08/32] style check --- cpp/include/cuml/ensemble/randomforest.hpp | 2 +- cpp/src/randomforest/randomforest.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/include/cuml/ensemble/randomforest.hpp b/cpp/include/cuml/ensemble/randomforest.hpp index defe869347..84c951dd0c 100644 --- a/cpp/include/cuml/ensemble/randomforest.hpp +++ b/cpp/include/cuml/ensemble/randomforest.hpp @@ -217,4 +217,4 @@ RF_metrics score(const cumlHandle& user_handle, const RandomForestRegressorD* forest, const double* ref_labels, int n_rows, const double* predictions, int verbosity = CUML_LEVEL_INFO); -}; // namespace ML \ No newline at end of file +}; // namespace ML diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 625b1b37d4..1a17bbe0f8 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -795,4 +795,4 @@ template void build_treelite_forest( template void build_treelite_forest( ModelHandle* model, const RandomForestMetaData* forest, int num_features, int task_category, std::vector& data); -} // End namespace ML \ No newline at end of file +} // End namespace ML From 0fc2df19e9eff63fc44c7134760338e78a277fe8 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 09:02:35 -0500 Subject: [PATCH 09/32] fix style errors --- python/cuml/ensemble/randomforest_common.pyx | 39 ++++++++++--------- .../cuml/ensemble/randomforestclassifier.pyx | 6 +-- .../cuml/ensemble/randomforestregressor.pyx | 18 ++++----- 3 files changed, 30 insertions(+), 33 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index e91f1262cb..9275a4ff2e 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -33,7 +33,7 @@ from cuml.common import input_to_cuml_array, rmm_cupy_ary cimport cython -# create a cdef class and cdef func which will call the C++ cdef func and then return the required handle and stuff + class BaseRandomForestModel(Base): variables = ['n_estimators', 'max_depth', 'handle', 'max_features', 'n_bins', @@ -166,11 +166,9 @@ class BaseRandomForestModel(Base): self.model_pbuf_bytes = bytearray(memoryview(pbuf_mod_view)) return self.model_pbuf_bytes - def _obtain_treelite_handle(self): if self.treelite_handle: - print(" treelite handle in obt : ", self.treelite_handle) - return self.treelite_handle # Use cached version + return self.treelite_handle # Use cached version cdef ModelHandle cuml_model_ptr = NULL cdef unsigned char[::1] model_pbuf_mv cdef vector[unsigned char] model_pbuf_vec @@ -180,7 +178,7 @@ class BaseRandomForestModel(Base): model_pbuf_vec.assign(& model_pbuf_mv[0], & model_pbuf_mv[model_pbuf_mv.shape[0]]) else: - model_pbuf_vec = bytearray() + model_pbuf_vec = bytearray() if self.RF_type == CLASSIFICATION: build_treelite_forest( & cuml_model_ptr, @@ -214,12 +212,14 @@ class BaseRandomForestModel(Base): " than the number of samples used for training.") if self.RF_type == CLASSIFICATION: y_m, _, _, y_dtype = \ - input_to_cuml_array(y, check_dtype=np.int32, - convert_to_dtype=(np.int32 if convert_dtype - else None), - check_rows=self.n_rows, check_cols=1) + input_to_cuml_array( + y, check_dtype=np.int32, + convert_to_dtype=(np.int32 if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) if y_dtype != np.int32: - raise TypeError("The labels `y` need to be of dtype `np.int32`") + raise TypeError("The labels `y` need to be of dtype" + " `np.int32`") unique_labels = rmm_cupy_ary(cp.unique, y_m) self.num_classes = len(unique_labels) for i in range(self.num_classes): @@ -229,10 +229,11 @@ class BaseRandomForestModel(Base): "0 to the number of unique label values") else: y_m, _, _, y_dtype = \ - input_to_cuml_array(y, - convert_to_dtype=(self.dtype if convert_dtype - else None), - check_rows=self.n_rows, check_cols=1) + input_to_cuml_array( + y, + convert_to_dtype=(self.dtype if convert_dtype + else None), + check_rows=self.n_rows, check_cols=1) if self.dtype == np.float64: warnings.warn("To use GPU-based prediction, first train using \ @@ -240,11 +241,12 @@ class BaseRandomForestModel(Base): max_feature_val = self._get_max_feat_val() if type(self.min_rows_per_node) == float: - self.min_rows_per_node = math.ceil(self.min_rows_per_node*self.n_rows) + self.min_rows_per_node = \ + math.ceil(self.min_rows_per_node*self.n_rows) del X del y return X_m, y_m, max_feature_val - + def _tl_model_handles(self, model_bytes): cdef ModelHandle cuml_model_ptr = NULL if self.RF_type == CLASSIFICATION: @@ -265,7 +267,6 @@ class BaseRandomForestModel(Base): return ctypes.c_void_p(mod_handle).value - def _concatenate_treelite_handle(self, treelite_handle): cdef ModelHandle concat_model_handle = NULL cdef vector[ModelHandle] *model_handles \ @@ -293,7 +294,6 @@ class BaseRandomForestModel(Base): self.n_estimators = tl_model.num_trees return self - def _predict_model_on_gpu(self, X, algo, convert_dtype, fil_sparse_format, threshold=0.5, output_class=False, predict_proba=False): @@ -329,7 +329,7 @@ class BaseRandomForestModel(Base): preds = tl_to_fil_model.predict(X, output_type=out_type, predict_proba=predict_proba) return preds - + def _get_params(self, model, deep): params = dict() for key in model.variables: @@ -352,6 +352,7 @@ class BaseRandomForestModel(Base): setattr(self, key, value) return self + def _check_fil_parameter_validity(depth, algo, fil_sparse_format): storage_format = _check_fil_sparse_format_value(fil_sparse_format) if (depth > 16 and (storage_format == 'dense' or diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 3dcd7c87b6..205fb5b0c4 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -880,8 +880,6 @@ class RandomForestClassifier(BaseRandomForestModel): ----------- deep : boolean (default = True) """ - - return self._get_params(model=RandomForestClassifier, deep=deep) @@ -895,9 +893,7 @@ class RandomForestClassifier(BaseRandomForestModel): ----------- params : dict of new params """ - # Resetting handle as __setstate__ overwrites with handle=None - - + # Resetting handle as __setstate__ overwrites with handle=Non return self._set_params(model=RandomForestClassifier, **params) diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 5089e8afb9..2f41898113 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -217,10 +217,10 @@ class RandomForestRegressor(BaseRandomForestModel): **kwargs): self.RF_type = REGRESSION self._create_model(model=RandomForestRegressor, - split_criterion=split_criterion, - seed=seed, n_streams=n_streams, - accuracy_metric=accuracy_metric, - **kwargs) + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + accuracy_metric=accuracy_metric, + **kwargs) """ TODO: Add the preprocess and postprocess functions @@ -356,7 +356,6 @@ class RandomForestRegressor(BaseRandomForestModel): algo=algo, fil_sparse_format=fil_sparse_format) - """ TODO : Move functions duplicated in the RF classifier and regressor to a shared file. Cuml issue #1854 has been created to track this. @@ -549,10 +548,11 @@ class RandomForestRegressor(BaseRandomForestModel): setting predict_model = 'CPU'") else: - preds = self._predict_model_on_gpu(X=X, - algo=algo, - convert_dtype=convert_dtype, - fil_sparse_format=fil_sparse_format) + preds = self._predict_model_on_gpu( + X=X, + algo=algo, + convert_dtype=convert_dtype, + fil_sparse_format=fil_sparse_format) return preds From 0c02d92ba9060795c0d698c3b89a274d285a1894 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 09:14:14 -0500 Subject: [PATCH 10/32] update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b4d2bd61b2..b3612a036a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - PR #2310: Pinning ucx-py to 0.14 to make 0.15 CI pass - PR #1945: enable clang tidy +- PR #2237: Refactor RF cython code ## Bug Fixes From 2dc70fd0b7db82e4141d23e5a98c1568124c8698 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 16:11:26 -0500 Subject: [PATCH 11/32] updated code --- python/cuml/ensemble/randomforest_common.pyx | 30 ++++--- .../cuml/ensemble/randomforestclassifier.pyx | 85 ++++++++++--------- .../cuml/ensemble/randomforestregressor.pyx | 52 ++++++------ 3 files changed, 89 insertions(+), 78 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 9275a4ff2e..23d99ad492 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -43,7 +43,7 @@ class BaseRandomForestModel(Base): 'verbose', 'rows_sample', 'max_leaves', 'quantile_per_tree'] - def _create_model(self, model, seed, split_criterion, + def _create_model(self, seed, split_criterion, n_streams, n_estimators=100, max_depth=16, handle=None, max_features='auto', n_bins=8, split_algo=1, bootstrap=True, @@ -59,7 +59,8 @@ class BaseRandomForestModel(Base): quantile_per_tree=False, criterion=None): if accuracy_metric: - model.variables.append('accuracy_metric') + BaseRandomForestModel.variables.append('accuracy_metric') + sklearn_params = {"criterion": criterion, "min_samples_leaf": min_samples_leaf, "min_weight_fraction_leaf": min_weight_fraction_leaf, @@ -71,7 +72,7 @@ class BaseRandomForestModel(Base): "class_weight": class_weight} for key, vals in sklearn_params.items(): - if vals is not None: + if vals: raise TypeError(" The Scikit-learn variable ", key, " is not supported in cuML," " please read the cuML documentation for" @@ -80,9 +81,11 @@ class BaseRandomForestModel(Base): if handle is None: handle = Handle(n_streams) - super(model, self).__init__(handle=handle, - verbose=verbose, - output_type=output_type) + super(BaseRandomForestModel, self).__init__( + handle=handle, + verbose=verbose, + output_type=output_type) + if max_depth < 0: raise ValueError("Must specify max_depth >0 ") @@ -152,6 +155,9 @@ class BaseRandomForestModel(Base): of information is already present in the model then the respective step is skipped. """ + if self.dtype == np.float64: + raise TypeError("To use pickling, first train the model" + " using float 32 data.") if self.model_pbuf_bytes: return self.model_pbuf_bytes elif self.treelite_handle: @@ -179,6 +185,7 @@ class BaseRandomForestModel(Base): & model_pbuf_mv[model_pbuf_mv.shape[0]]) else: model_pbuf_vec = bytearray() + if self.RF_type == CLASSIFICATION: build_treelite_forest( & cuml_model_ptr, @@ -198,7 +205,7 @@ class BaseRandomForestModel(Base): self.treelite_handle = ctypes.c_void_p(mod_ptr).value return self.treelite_handle - def _dataset_setup(self, X, y, convert_dtype): + def _dataset_setup_for_fit(self, X, y, convert_dtype): self._set_output_type(X) # Reset the old tree data for new fit call @@ -210,6 +217,7 @@ class BaseRandomForestModel(Base): if self.n_bins > self.n_rows: raise ValueError("The number of bins,`n_bins` can not be greater" " than the number of samples used for training.") + if self.RF_type == CLASSIFICATION: y_m, _, _, y_dtype = \ input_to_cuml_array( @@ -330,23 +338,23 @@ class BaseRandomForestModel(Base): predict_proba=predict_proba) return preds - def _get_params(self, model, deep): + def _get_params(self, deep): params = dict() - for key in model.variables: + for key in BaseRandomForestModel.variables: if key in ['handle']: continue var_value = getattr(self, key, None) params[key] = var_value return params - def _set_params(self, model, **params): + def _set_params(self, **params): self.handle.__setstate__(self.n_streams) self.model_pbuf_bytes = [] if not params: return self for key, value in params.items(): - if key not in model.variables: + if key not in BaseRandomForestModel.variables: raise ValueError('Invalid parameter for estimator') else: setattr(self, key, value) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 205fb5b0c4..9a20b7da06 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -21,38 +21,34 @@ # cython: language_level = 3 import ctypes -import cudf -import cupy as cp -import math import numpy as np import rmm import warnings -from libcpp cimport bool -from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t -from libc.stdlib cimport calloc, malloc, free - -from cython.operator cimport dereference as deref +import cuml.common.logger as logger from cuml import ForestInference -from cuml.fil.fil import TreeliteModel - from cuml.common.array import CumlArray from cuml.common.handle import Handle -from cuml.ensemble.randomforest_common import BaseRandomForestModel +from cuml.common import input_to_cuml_array, rmm_cupy_ary -from cuml.common.handle cimport cumlHandle -from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ - _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model from cuml.ensemble.randomforest_common import BaseRandomForestModel - +from cuml.ensemble.randomforest_common import _obtain_treelite_model, \ + _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * -import cuml.common.logger as logger -from cuml.common import input_to_cuml_array, rmm_cupy_ary + +from cuml.fil.fil import TreeliteModel + +from cython.operator cimport dereference as deref + +from libcpp cimport bool +from libcpp.vector cimport vector +from libc.stdint cimport uintptr_t +from libc.stdlib cimport calloc, malloc, free from numba import cuda +from cuml.common.handle cimport cumlHandle cimport cuml.common.handle cimport cuml.common.cuda @@ -238,10 +234,10 @@ class RandomForestClassifier(BaseRandomForestModel): self.RF_type = CLASSIFICATION self.num_classes = 2 - self._create_model(model=RandomForestClassifier, - split_criterion=split_criterion, - seed=seed, n_streams=n_streams, - **kwargs) + super(RandomForestClassifier, self)._create_model( + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + **kwargs) """ TODO: @@ -413,7 +409,8 @@ class RandomForestClassifier(BaseRandomForestModel): """ - X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) + X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, + convert_dtype) cdef uintptr_t X_ptr, y_ptr X_ptr = X_m.ptr y_ptr = y_m.ptr @@ -532,7 +529,7 @@ class RandomForestClassifier(BaseRandomForestModel): def predict(self, X, predict_model="GPU", output_class=True, threshold=0.5, - algo='auto', + algo='auto', num_classes=None, convert_dtype=True, fil_sparse_format='auto'): """ @@ -570,7 +567,7 @@ class RandomForestClassifier(BaseRandomForestModel): Threshold used for classification. Optional and required only while performing the predict operation on the GPU. It is applied if output_class == True, else it is ignored - num_classes : int (default = 2) + num_classes : int (default = None) number of different classes present in the dataset. This variable will be depricated in 0.16 convert_dtype : bool, optional (default = True) @@ -592,7 +589,12 @@ class RandomForestClassifier(BaseRandomForestModel): y : NumPy Dense vector (int) of shape (n_samples, 1) """ - if predict_model == "CPU" or self.num_classes > 2: + if (num_classes and self.num_classes != num_classes): + raise ValueError("The number of classes in the test dataset" + " should be equal to the number of classes" + " present in the training dataset.") + + elif predict_model == "CPU" or self.num_classes > 2: if self.num_classes > 2 and predict_model == "GPU": warnings.warn("Switching over to use the CPU predict since " "the GPU predict currently cannot perform " @@ -681,7 +683,8 @@ class RandomForestClassifier(BaseRandomForestModel): def predict_proba(self, X, output_class=True, threshold=0.5, algo='auto', convert_dtype=True, - fil_sparse_format='auto'): + fil_sparse_format='auto', + num_classes=None): """ Predicts class probabilites for X. This function uses the GPU implementation of predict. Therefore, data with 'dtype = np.float32' @@ -717,8 +720,9 @@ class RandomForestClassifier(BaseRandomForestModel): Threshold used for classification. Optional and required only while performing the predict operation on the GPU. It is applied if output_class == True, else it is ignored - num_classes : int (default = 2) - number of different classes present in the dataset + num_classes : int (default = None) + number of different classes present in the dataset. This variable + will be depricated in 0.16 convert_dtype : bool, optional (default = True) When set to True, the predict method will, when necessary, convert the input to the data type which was used to train the model. This @@ -753,6 +757,11 @@ class RandomForestClassifier(BaseRandomForestModel): "classification models is currently not " "implemented. Please check cuml issue " "#1679 for more information.") + + elif (num_classes and self.num_classes != num_classes): + raise ValueError("The number of classes in the test dataset" + " should be equal to the number of classes" + " present in the training dataset.") preds_proba = \ self._predict_model_on_gpu(X, output_class=output_class, threshold=threshold, @@ -764,7 +773,7 @@ class RandomForestClassifier(BaseRandomForestModel): return preds_proba def score(self, X, y, threshold=0.5, - algo='auto', num_classes=2, predict_model="GPU", + algo='auto', num_classes=None, predict_model="GPU", convert_dtype=True, fil_sparse_format='auto'): """ Calculates the accuracy metric score of the model for X. @@ -792,8 +801,9 @@ class RandomForestClassifier(BaseRandomForestModel): threshold is used to for classification This is optional and required only while performing the predict operation on the GPU. - num_classes : integer - number of different classes present in the dataset + num_classes : int (default = None) + number of different classes present in the dataset. This variable + will be depricated in 0.16 convert_dtype : boolean, default=True whether to convert input data to correct dtype automatically predict_model : String (default = 'GPU') @@ -831,6 +841,7 @@ class RandomForestClassifier(BaseRandomForestModel): threshold=threshold, algo=algo, convert_dtype=convert_dtype, predict_model=predict_model, + num_classes=num_classes, fil_sparse_format=fil_sparse_format) cdef uintptr_t preds_ptr @@ -875,27 +886,23 @@ class RandomForestClassifier(BaseRandomForestModel): """ Returns the value of all parameters required to configure this estimator as a dictionary. - Parameters ----------- deep : boolean (default = True) """ - return self._get_params(model=RandomForestClassifier, - deep=deep) + return self._get_params(deep=deep) def set_params(self, **params): """ Sets the value of parameters required to configure this estimator, it functions similar to the sklearn set_params. - Parameters ----------- params : dict of new params """ - # Resetting handle as __setstate__ overwrites with handle=Non - return self._set_params(model=RandomForestClassifier, - **params) + # Resetting handle as __setstate__ overwrites with handle=None + return self._set_params(**params) def print_summary(self): """ diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 2f41898113..3e5317660e 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -20,34 +20,34 @@ # cython: language_level = 3 import ctypes -import cudf -import math import numpy as np +import rmm import warnings -from libcpp cimport bool -from libcpp.vector cimport vector -from libc.stdint cimport uintptr_t -from libc.stdlib cimport calloc, malloc, free +import cuml.common.logger as logger from cuml import ForestInference -from cuml.fil.fil import TreeliteModel from cuml.common.array import CumlArray from cuml.common.handle import Handle -from cuml.ensemble.randomforest_common import BaseRandomForestModel -from cuml.common.handle cimport cumlHandle -from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ - _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model -from cuml.ensemble.randomforest_common import BaseRandomForestModel +from cuml.common import input_to_cuml_array, rmm_cupy_ary +from cuml.ensemble.randomforest_common import BaseRandomForestModel +from cuml.ensemble.randomforest_common import _obtain_treelite_model, \ + _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * -from cuml.common import input_to_cuml_array -import cuml.common.logger as logger + +from cuml.fil.fil import TreeliteModel from cython.operator cimport dereference as deref +from libcpp cimport bool +from libcpp.vector cimport vector +from libc.stdint cimport uintptr_t +from libc.stdlib cimport calloc, malloc, free + from numba import cuda +from cuml.common.handle cimport cumlHandle cimport cuml.common.handle cimport cuml.common.cuda @@ -216,11 +216,11 @@ class RandomForestRegressor(BaseRandomForestModel): accuracy_metric='mse', n_streams=8, **kwargs): self.RF_type = REGRESSION - self._create_model(model=RandomForestRegressor, - split_criterion=split_criterion, - seed=seed, n_streams=n_streams, - accuracy_metric=accuracy_metric, - **kwargs) + super(RandomForestRegressor, self)._create_model( + split_criterion=split_criterion, + seed=seed, n_streams=n_streams, + accuracy_metric=accuracy_metric, + **kwargs) """ TODO: Add the preprocess and postprocess functions @@ -306,7 +306,6 @@ class RandomForestRegressor(BaseRandomForestModel): tl_to_fil_model : Treelite version of this model """ handle = self._obtain_treelite_handle() - return _obtain_treelite_model(handle) def convert_to_fil_model(self, output_class=False, @@ -317,7 +316,7 @@ class RandomForestRegressor(BaseRandomForestModel): Random Forest model. Parameters ---------- - output_class : boolean (default = True) + output_class : boolean (default = False) This is optional and required only while performing the predict operation on the GPU. If true, return a 1 or 0 depending on whether the raw @@ -377,7 +376,8 @@ class RandomForestRegressor(BaseRandomForestModel): ndarray, cuda array interface compliant array like CuPy These labels should be contiguous integers from 0 to n_classes. """ - X_m, y_m, max_feature_val = self._dataset_setup(X, y, convert_dtype) + X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, + convert_dtype) # Reset the old tree data for new fit call cdef uintptr_t X_ptr, y_ptr @@ -663,27 +663,23 @@ class RandomForestRegressor(BaseRandomForestModel): """ Returns the value of all parameters required to configure this estimator as a dictionary. - Parameters ----------- deep : boolean (default = True) """ - return self._get_params(model=RandomForestRegressor, - deep=deep) + return self._get_params(deep=deep) def set_params(self, **params): """ Sets the value of parameters required to configure this estimator, it functions similar to the sklearn set_params. - Parameters ----------- params : dict of new params """ # Resetting handle as __setstate__ overwrites with handle=None - return self._set_params(model=RandomForestRegressor, - **params) + return self._set_params(**params) def print_summary(self): """ From 728600f3dd01cc101d920a8995c002fdae265898 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 29 May 2020 17:35:27 -0500 Subject: [PATCH 12/32] fixed style errors --- python/cuml/ensemble/randomforest_shared.pxd | 31 +++---------------- .../cuml/ensemble/randomforestregressor.pyx | 4 +-- 2 files changed, 7 insertions(+), 28 deletions(-) diff --git a/python/cuml/ensemble/randomforest_shared.pxd b/python/cuml/ensemble/randomforest_shared.pxd index 03665439e8..0531685efa 100644 --- a/python/cuml/ensemble/randomforest_shared.pxd +++ b/python/cuml/ensemble/randomforest_shared.pxd @@ -33,12 +33,10 @@ from cuml.common.handle import Handle from cuml import ForestInference from cuml.common.base import Base from cuml.common.handle cimport cumlHandle -from cuml.utils import get_cudf_column_ptr, get_dev_array_ptr, \ +from cuml.common import get_cudf_column_ptr, get_dev_array_ptr, \ input_to_dev_array, zeros cimport cuml.common.handle cimport cuml.common.cuda -cimport cython - cdef extern from "treelite/c_api.h": ctypedef void* ModelHandle @@ -47,7 +45,7 @@ cdef extern from "treelite/c_api.h": ModelHandle model) cdef const char* TreeliteGetLastError() -cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: +cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": cdef enum CRITERION: GINI, ENTROPY, @@ -55,15 +53,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: MAE, CRITERION_END -cdef extern from "cuml/tree/flatnode.h" namespace "ML::Flatnode" nogil: - cdef cppclass SparseTreeNode[T, L]: - L prediction - int colid - T quesval - T best_metric_val - int left_child_id - -cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil: +cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree": cdef struct DecisionTreeParams: int max_depth int max_leaves @@ -75,15 +65,7 @@ cdef extern from "cuml/tree/decisiontree.hpp" namespace "ML::DecisionTree" nogil bool quantile_per_tree CRITERION split_criterion - cdef cppclass TreeMetaDataNode[T, L]: - int treeid - int depth_counter - int leaf_counter - double prepare_time - double train_time - vector[SparseTreeNode[T, L]] sparsetree - -cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: +cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": cdef enum RF_type: CLASSIFICATION, @@ -108,20 +90,18 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: pass cdef cppclass RandomForestMetaData[T, L]: - ctypedef TreeMetaDataNode[T, L]* trees + void* trees RF_params rf_params # # Treelite handling # - cdef void build_treelite_forest[T, L](ModelHandle*, RandomForestMetaData[T, L]*, int, int, vector[unsigned char] &) except + - cdef vector[unsigned char] save_model_protobuf(ModelHandle) except + cdef void delete_rf_metadata[T, L](RandomForestMetaData[T, L]*) except + @@ -148,4 +128,3 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML" nogil: cdef ModelHandle concatenate_trees( vector[ModelHandle] &treelite_handles) except + - diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 3e5317660e..92f4522ab4 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -55,7 +55,7 @@ cimport cython cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": - + cdef void fit(cumlHandle & handle, RandomForestMetaData[float, float]*, float*, @@ -73,7 +73,7 @@ cdef extern from "cuml/ensemble/randomforest.hpp" namespace "ML": double*, RF_params, int) except + - + cdef void predict(cumlHandle& handle, RandomForestMetaData[float, float] *, float*, From a7072d76f1823a7bdec94eeead01a1e5799a4313 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Sun, 31 May 2020 17:05:02 -0500 Subject: [PATCH 13/32] add error to obtain_treelite_handle func --- python/cuml/ensemble/randomforest_common.pyx | 6 ++++++ python/cuml/ensemble/randomforestclassifier.pyx | 1 + 2 files changed, 7 insertions(+) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 23d99ad492..af5166973f 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -187,6 +187,12 @@ class BaseRandomForestModel(Base): model_pbuf_vec = bytearray() if self.RF_type == CLASSIFICATION: + if self.num_classes > 2: + raise NotImplementedError("Pickling for multi-class " + "classification models is currently" + " not implemented. Please check" + " cuml issue #1679 for more" + " information.") build_treelite_forest( & cuml_model_ptr, self.rf_forest, diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 9a20b7da06..6266ab57b5 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -594,6 +594,7 @@ class RandomForestClassifier(BaseRandomForestModel): " should be equal to the number of classes" " present in the training dataset.") + elif predict_model == "CPU" or self.num_classes > 2: if self.num_classes > 2 and predict_model == "GPU": warnings.warn("Switching over to use the CPU predict since " From b120d7f774fe519bc175baf37ab50f85cfc58e58 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Sun, 31 May 2020 17:07:10 -0500 Subject: [PATCH 14/32] fix spacing issue --- python/cuml/ensemble/randomforestclassifier.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 6266ab57b5..9a20b7da06 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -594,7 +594,6 @@ class RandomForestClassifier(BaseRandomForestModel): " should be equal to the number of classes" " present in the training dataset.") - elif predict_model == "CPU" or self.num_classes > 2: if self.num_classes > 2 and predict_model == "GPU": warnings.warn("Switching over to use the CPU predict since " From 81a34f3c5d2bd71b38283263a18e6388ff790e6e Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Tue, 2 Jun 2020 16:52:01 -0500 Subject: [PATCH 15/32] update rf common --- python/cuml/ensemble/randomforest_common.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index af5166973f..0385ac42a7 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -109,7 +109,6 @@ class BaseRandomForestModel(Base): self.max_depth = max_depth self.max_features = max_features self.bootstrap = bootstrap - self.verbose = verbose self.n_bins = n_bins self.n_cols = None self.dtype = dtype From 494cdf69ad8b90ed866aa6e4e0c2363a1a50fa39 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Tue, 2 Jun 2020 17:38:25 -0500 Subject: [PATCH 16/32] remove rf common py file --- python/cuml/ensemble/randomforest_common.py | 94 --------------------- 1 file changed, 94 deletions(-) delete mode 100644 python/cuml/ensemble/randomforest_common.py diff --git a/python/cuml/ensemble/randomforest_common.py b/python/cuml/ensemble/randomforest_common.py deleted file mode 100644 index 5b3ecd89b7..0000000000 --- a/python/cuml/ensemble/randomforest_common.py +++ /dev/null @@ -1,94 +0,0 @@ -# -# Copyright (c) 2020, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# - -from cuml import ForestInference -from cuml.fil.fil import TreeliteModel as tl - - -def _check_fil_parameter_validity(depth, algo, fil_sparse_format): - storage_format = _check_fil_sparse_format_value(fil_sparse_format) - if (depth > 16 and (storage_format == 'dense' or - algo == 'tree_reorg' or - algo == 'batch_tree_reorg')): - raise ValueError("While creating a forest with max_depth greater " - "than 16, `fil_sparse_format` should be True. " - "If `fil_sparse_format=False` then the memory" - "consumed while creating the FIL forest is very " - "large and the process will be aborted. In " - "addition, `algo` must be either set to `naive' " - "or `auto` to set 'fil_sparse_format=True`.") - return storage_format - - -def _check_fil_sparse_format_value(fil_sparse_format): - accepted_vals = [True, False, 'auto'] - if fil_sparse_format == 'auto': - storage_format = fil_sparse_format - elif not fil_sparse_format: - storage_format = 'dense' - elif fil_sparse_format not in accepted_vals: - raise ValueError("The value entered for spares_forest is not " - "supported. Please refer to the documentation " - "to see the accepted values.") - else: - storage_format = 'sparse' - - return storage_format - - -def _obtain_treelite_model(treelite_handle): - """ - Creates a Treelite model using the treelite handle - obtained from the cuML Random Forest model. - - Returns - ---------- - tl_to_fil_model : Treelite version of this model - """ - treelite_model = \ - tl.from_treelite_model_handle(treelite_handle) - return treelite_model - - -def _obtain_fil_model(treelite_handle, depth, - output_class=True, - threshold=0.5, algo='auto', - fil_sparse_format='auto'): - """ - Creates a Forest Inference (FIL) model using the treelite - handle obtained from the cuML Random Forest model. - - Returns - ---------- - fil_model : - A Forest Inference model which can be used to perform - inferencing on the random forest model. - """ - - storage_format = \ - _check_fil_parameter_validity(depth=depth, - fil_sparse_format=fil_sparse_format, - algo=algo) - - fil_model = ForestInference() - tl_to_fil_model = \ - fil_model.load_using_treelite_handle(treelite_handle, - output_class=output_class, - threshold=threshold, - algo=algo, - storage_type=storage_format) - - return tl_to_fil_model From 8f77cdf2623350dae2bf9dc052c60b4af1b6126d Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 3 Jun 2020 06:49:18 -0500 Subject: [PATCH 17/32] update docs --- python/cuml/ensemble/randomforest_common.pyx | 1 + python/cuml/ensemble/randomforestclassifier.pyx | 1 - python/cuml/ensemble/randomforestregressor.pyx | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 0385ac42a7..97bbfe77a2 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -353,6 +353,7 @@ class BaseRandomForestModel(Base): return params def _set_params(self, **params): + # Resetting handle as __setstate__ overwrites with handle=None self.handle.__setstate__(self.n_streams) self.model_pbuf_bytes = [] diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 9a20b7da06..1ec8179b92 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -901,7 +901,6 @@ class RandomForestClassifier(BaseRandomForestModel): ----------- params : dict of new params """ - # Resetting handle as __setstate__ overwrites with handle=None return self._set_params(**params) def print_summary(self): diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 92f4522ab4..9eb67631c8 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -678,7 +678,6 @@ class RandomForestRegressor(BaseRandomForestModel): ----------- params : dict of new params """ - # Resetting handle as __setstate__ overwrites with handle=None return self._set_params(**params) def print_summary(self): From 5866f1797c0a99a310f2c1930ef8ef07ab1e8c1f Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 10 Jun 2020 15:18:31 -0500 Subject: [PATCH 18/32] update _create_model func to __init__ --- python/cuml/ensemble/randomforest_common.pyx | 35 +++++++++++-------- .../cuml/ensemble/randomforestclassifier.pyx | 22 +++++------- .../cuml/ensemble/randomforestregressor.pyx | 13 +++---- 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 97bbfe77a2..d0c7ef02f0 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -43,20 +43,20 @@ class BaseRandomForestModel(Base): 'verbose', 'rows_sample', 'max_leaves', 'quantile_per_tree'] - def _create_model(self, seed, split_criterion, - n_streams, n_estimators=100, - max_depth=16, handle=None, max_features='auto', - n_bins=8, split_algo=1, bootstrap=True, - bootstrap_features=False, - verbose=False, min_rows_per_node=2, - rows_sample=1.0, max_leaves=-1, - accuracy_metric=None, dtype=None, - output_type=None, min_samples_leaf=None, - min_weight_fraction_leaf=None, n_jobs=None, - max_leaf_nodes=None, min_impurity_decrease=0.0, - min_impurity_split=None, oob_score=None, - random_state=None, warm_start=None, class_weight=None, - quantile_per_tree=False, criterion=None): + def __init__(self, split_criterion, seed=None, + n_streams=8, n_estimators=100, + max_depth=16, handle=None, max_features='auto', + n_bins=8, split_algo=1, bootstrap=True, + bootstrap_features=False, + verbose=False, min_rows_per_node=2, + rows_sample=1.0, max_leaves=-1, + accuracy_metric=None, dtype=None, + output_type=None, min_samples_leaf=None, + min_weight_fraction_leaf=None, n_jobs=None, + max_leaf_nodes=None, min_impurity_decrease=0.0, + min_impurity_split=None, oob_score=None, + random_state=None, warm_start=None, class_weight=None, + quantile_per_tree=False, criterion=None): if accuracy_metric: BaseRandomForestModel.variables.append('accuracy_metric') @@ -78,6 +78,13 @@ class BaseRandomForestModel(Base): " please read the cuML documentation for" " more information") + if ((seed is not None) and (n_streams != 1)): + warnings.warn("For reproducible results in Random Forest" + " Classifier and for almost reproducible results" + " in Random Forest Regressor, n_streams==1 is " + "recommended. If n_streams is > 1, results may vary " + "due to stream/thread timing differences, even when " + "random_seed is set") if handle is None: handle = Handle(n_streams) diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 1ec8179b92..12f13f0af3 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -224,19 +224,13 @@ class RandomForestClassifier(BaseRandomForestModel): Seed for the random number generator. Unseeded by default. """ - def __init__(self, split_criterion=0, seed=None, - n_streams=8, **kwargs): - if ((seed is not None) and (n_streams != 1)): - warnings.warn("For reproducible results, n_streams==1 is " - "recommended. If n_streams is > 1, results may vary " - "due to stream/thread timing differences, even when " - "random_seed is set") + def __init__(self, split_criterion=0, + **kwargs): self.RF_type = CLASSIFICATION self.num_classes = 2 - super(RandomForestClassifier, self)._create_model( + super(RandomForestClassifier, self).__init__( split_criterion=split_criterion, - seed=seed, n_streams=n_streams, **kwargs) """ @@ -248,7 +242,6 @@ class RandomForestClassifier(BaseRandomForestModel): """ def __getstate__(self): state = self.__dict__.copy() - del state['handle'] cdef size_t params_t cdef RandomForestMetaData[float, int] *rf_forest cdef RandomForestMetaData[double, int] *rf_forest64 @@ -268,16 +261,19 @@ class RandomForestClassifier(BaseRandomForestModel): params_t64 state["rf_params64"] = rf_forest64.rf_params - state['n_cols'] = self.n_cols + state["n_cols"] = self.n_cols state["verbose"] = self.verbose state["model_pbuf_bytes"] = self.model_pbuf_bytes state["treelite_handle"] = None - + state["split_criterion"] = self.split_criterion + state["handle"] = self.handle return state def __setstate__(self, state): super(RandomForestClassifier, self).__init__( - handle=None, verbose=state['verbose']) + split_criterion=state["split_criterion"], + handle=state["handle"], + verbose=state["verbose"]) cdef RandomForestMetaData[float, int] *rf_forest = \ new RandomForestMetaData[float, int]() cdef RandomForestMetaData[double, int] *rf_forest64 = \ diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 9eb67631c8..eead99e24f 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -212,13 +212,12 @@ class RandomForestRegressor(BaseRandomForestModel): """ - def __init__(self, split_criterion=2, seed=None, - accuracy_metric='mse', n_streams=8, + def __init__(self, split_criterion=2, + accuracy_metric='mse', **kwargs): self.RF_type = REGRESSION - super(RandomForestRegressor, self)._create_model( + super(RandomForestRegressor, self).__init__( split_criterion=split_criterion, - seed=seed, n_streams=n_streams, accuracy_metric=accuracy_metric, **kwargs) """ @@ -228,7 +227,6 @@ class RandomForestRegressor(BaseRandomForestModel): """ def __getstate__(self): state = self.__dict__.copy() - del state['handle'] cdef size_t params_t cdef RandomForestMetaData[float, float] *rf_forest cdef RandomForestMetaData[double, double] *rf_forest64 @@ -252,12 +250,15 @@ class RandomForestRegressor(BaseRandomForestModel): state["verbose"] = self.verbose state["model_pbuf_bytes"] = self.model_pbuf_bytes state["treelite_handle"] = None + state["split_criterion"] = self.split_criterion + state["handle"] = self.handle return state def __setstate__(self, state): super(RandomForestRegressor, self).__init__( - handle=None, verbose=state['verbose']) + split_criterion=state["split_criterion"], + handle=state["handle"], verbose=state['verbose']) cdef RandomForestMetaData[float, float] *rf_forest = \ new RandomForestMetaData[float, float]() cdef RandomForestMetaData[double, double] *rf_forest64 = \ From 3bfe631a8f6effcd84384a8d8a3129ddfc28870f Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 10 Jun 2020 15:22:08 -0500 Subject: [PATCH 19/32] update n_streams warning --- python/cuml/ensemble/randomforest_common.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index d0c7ef02f0..e474b9fb87 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -80,7 +80,7 @@ class BaseRandomForestModel(Base): if ((seed is not None) and (n_streams != 1)): warnings.warn("For reproducible results in Random Forest" - " Classifier and for almost reproducible results" + " Classifier or for almost reproducible results" " in Random Forest Regressor, n_streams==1 is " "recommended. If n_streams is > 1, results may vary " "due to stream/thread timing differences, even when " From 2a31c0c8eb313addf9654642610228bd2bfa53d1 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 17 Jun 2020 09:51:37 -0500 Subject: [PATCH 20/32] update code based on reviews --- python/cuml/ensemble/__init__.py | 2 +- python/cuml/ensemble/randomforest_common.pyx | 111 ++++++++++++------ .../cuml/ensemble/randomforestclassifier.pyx | 4 +- .../cuml/ensemble/randomforestregressor.pyx | 6 +- python/cuml/fil/fil.pyx | 18 +-- 5 files changed, 90 insertions(+), 51 deletions(-) diff --git a/python/cuml/ensemble/__init__.py b/python/cuml/ensemble/__init__.py index 1bb48ec807..7cd2567acf 100644 --- a/python/cuml/ensemble/__init__.py +++ b/python/cuml/ensemble/__init__.py @@ -18,4 +18,4 @@ from cuml.ensemble.randomforestclassifier import RandomForestClassifier from cuml.ensemble.randomforestregressor import RandomForestRegressor from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ - _check_fil_sparse_format_value, _obtain_treelite_model, _obtain_fil_model + _obtain_treelite_model, _obtain_fil_model diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index e474b9fb87..6af8d16d35 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -42,6 +42,8 @@ class BaseRandomForestModel(Base): 'bootstrap', 'bootstrap_features', 'verbose', 'rows_sample', 'max_leaves', 'quantile_per_tree'] + criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, + '3': MAE, '4': CRITERION_END} def __init__(self, split_criterion, seed=None, n_streams=8, n_estimators=100, @@ -73,10 +75,12 @@ class BaseRandomForestModel(Base): for key, vals in sklearn_params.items(): if vals: - raise TypeError(" The Scikit-learn variable ", key, - " is not supported in cuML," - " please read the cuML documentation for" - " more information") + raise TypeError( + " The Scikit-learn variable ", key, + " is not supported in cuML," + " please read the cuML documentation at " + "(https://docs.rapids.ai/api/cuml/nightly/" + "api.html#random-forest) for more information") if ((seed is not None) and (n_streams != 1)): warnings.warn("For reproducible results in Random Forest" @@ -97,15 +101,15 @@ class BaseRandomForestModel(Base): raise ValueError("Must specify max_depth >0 ") self.split_algo = split_algo - criterion_dict = {'0': GINI, '1': ENTROPY, '2': MSE, - '3': MAE, '4': CRITERION_END} - if str(split_criterion) not in criterion_dict.keys(): + if (str(split_criterion) not in + BaseRandomForestModel.criterion_dict.keys()): warnings.warn("The split criterion chosen was not present" " in the list of options accepted by the model" " and so the CRITERION_END option has been chosen.") self.split_criterion = CRITERION_END else: - self.split_criterion = criterion_dict[str(split_criterion)] + self.split_criterion = \ + BaseRandomForestModel.criterion_dict[str(split_criterion)] self.min_rows_per_node = min_rows_per_node self.min_impurity_decrease = min_impurity_decrease @@ -143,27 +147,33 @@ class BaseRandomForestModel(Base): else: return 1.0 else: - raise ValueError("Wrong value passed in for max_features" - " please read the documentation") + raise ValueError( + "Wrong value passed in for max_features" + " please read the documentation present at " + "(https://docs.rapids.ai/api/cuml/nightly/api.html" + "#random-forest)") def _get_protobuf_bytes(self): """ Returns the self.model_pbuf_bytes. Cuml RF model gets converted to treelite protobuf bytes by: + 1. converting the cuml RF model to a treelite model. The treelite models handle (pointer) is returned + 2. The treelite model handle is used to convert the treelite model to a treelite protobuf model which is stored in a temporary file. The protobuf model information is read from the temporary file and the byte information is returned. + The treelite handle is stored `self.treelite_handle` and the treelite protobuf model bytes are stored in `self.model_pbuf_bytes`. If either of information is already present in the model then the respective step is skipped. """ if self.dtype == np.float64: - raise TypeError("To use pickling, first train the model" - " using float 32 data.") + raise TypeError("Pickling is only supported on models trained" + " on float32 data.") if self.model_pbuf_bytes: return self.model_pbuf_bytes elif self.treelite_handle: @@ -197,7 +207,7 @@ class BaseRandomForestModel(Base): raise NotImplementedError("Pickling for multi-class " "classification models is currently" " not implemented. Please check" - " cuml issue #1679 for more" + " cuml GitHub issue #1679 for more" " information.") build_treelite_forest( & cuml_model_ptr, @@ -239,7 +249,7 @@ class BaseRandomForestModel(Base): check_rows=self.n_rows, check_cols=1) if y_dtype != np.int32: raise TypeError("The labels `y` need to be of dtype" - " `np.int32`") + " `int32`") unique_labels = rmm_cupy_ary(cp.unique, y_m) self.num_classes = len(unique_labels) for i in range(self.num_classes): @@ -271,14 +281,14 @@ class BaseRandomForestModel(Base): cdef ModelHandle cuml_model_ptr = NULL if self.RF_type == CLASSIFICATION: build_treelite_forest( - & cuml_model_ptr, + &cuml_model_ptr, self.rf_forest, self.n_cols, self.num_classes, model_bytes) else: build_treelite_forest( - & cuml_model_ptr, + &cuml_model_ptr, self.rf_forest, self.n_cols, REGRESSION_MODEL, @@ -292,14 +302,18 @@ class BaseRandomForestModel(Base): cdef vector[ModelHandle] *model_handles \ = new vector[ModelHandle]() cdef uintptr_t mod_ptr + for i in treelite_handle: mod_ptr = i model_handles.push_back(( mod_ptr)) + self._reset_forest_data() + concat_model_handle = concatenate_trees(deref(model_handles)) cdef uintptr_t concat_model_ptr = concat_model_handle self.treelite_handle = concat_model_ptr + cdef vector[unsigned char] pbuf_mod_info = \ save_model( concat_model_ptr) cdef unsigned char[::1] pbuf_mod_view = \ @@ -327,8 +341,8 @@ class BaseRandomForestModel(Base): raise TypeError("GPU based predict only accepts np.float32 data. \ Please set convert_dtype=True to convert the test \ data to the same dtype as the data used to train, \ - ie. np.float32. If you would like to use test \ - data of dtype=np.float64 please set \ + ie. float32. If you would like to use test \ + data of dtype=float64 please set \ predict_model='CPU' to use the CPU implementation \ of predict.") @@ -360,8 +374,6 @@ class BaseRandomForestModel(Base): return params def _set_params(self, **params): - # Resetting handle as __setstate__ overwrites with handle=None - self.handle.__setstate__(self.n_streams) self.model_pbuf_bytes = [] if not params: @@ -375,8 +387,40 @@ class BaseRandomForestModel(Base): def _check_fil_parameter_validity(depth, algo, fil_sparse_format): - storage_format = _check_fil_sparse_format_value(fil_sparse_format) - if (depth > 16 and (storage_format == 'dense' or + """ + Check if the FIL storage format type passed by the user is right + for the trained cuml Random Forest model they have. + + Parameters + ---------- + depth : max depth value used to train model + algo : string (default = 'auto') + This is optional and required only while performing the + predict operation on the GPU. + 'naive' - simple inference using shared memory + 'tree_reorg' - similar to naive but trees rearranged to be more + coalescing-friendly + 'batch_tree_reorg' - similar to tree_reorg but predicting + multiple rows per thread block + `auto` - choose the algorithm automatically. Currently + 'batch_tree_reorg' is used for dense storage + and 'naive' for sparse storage + fil_sparse_format : boolean or string (default = 'auto') + This variable is used to choose the type of forest that will be + created in the Forest Inference Library. It is not required + while using predict_model='CPU'. + 'auto' - choose the storage type automatically + (currently True is chosen by auto) + False - create a dense forest + True - create a sparse forest, requires algo='naive' + or algo='auto' + Returns + ---------- + fil_sparse_format converted to string + """ + accepted_fil_spars_format = {True, False, 'auto'} + + if (depth > 16 and (fil_sparse_format is False or algo == 'tree_reorg' or algo == 'batch_tree_reorg')): raise ValueError("While creating a forest with max_depth greater " @@ -386,22 +430,13 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): "large and the process will be aborted. In " "addition, `algo` must be either set to `naive' " "or `auto` to set 'fil_sparse_format=True`.") - return storage_format - - -def _check_fil_sparse_format_value(fil_sparse_format): - accepted_vals = [True, False, 'auto'] - if fil_sparse_format == 'auto': - storage_format = fil_sparse_format - elif not fil_sparse_format: - storage_format = 'dense' - elif fil_sparse_format not in accepted_vals: - raise ValueError("The value entered for spares_forest is not " - "supported. Please refer to the documentation " - "to see the accepted values.") - else: - storage_format = 'sparse' - return storage_format + if fil_sparse_format not in accepted_fil_spars_format: + raise ValueError( + "The value entered for spares_forest is not " + "supported. Please refer to the documentation at " + "(https://docs.rapids.ai/api/cuml/nightly/api.html" + "#forest-inferencing) to see the accepted values.") + return str(fil_sparse_format) def _obtain_treelite_model(treelite_handle): diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 12f13f0af3..999b73a5d3 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -400,8 +400,8 @@ class RandomForestClassifier(BaseRandomForestModel): These labels should be contiguous integers from 0 to n_classes. convert_dtype : bool, optional (default = False) When set to True, the fit method will, when necessary, convert - y to be the same data type as X if they differ. This will increase - memory used for the method. + y to be of dtype int32. This will increase memory used for + the method. """ diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index eead99e24f..0c9f58549c 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -334,7 +334,7 @@ class RandomForestRegressor(BaseRandomForestModel): `auto` - choose the algorithm automatically. Currently 'batch_tree_reorg' is used for dense storage and 'naive' for sparse storage - fil_sparse_format : boolean or string (default = auto) + fil_sparse_format : boolean or string (default = 'auto') This variable is used to choose the type of forest that will be created in the Forest Inference Library. It is not required while using predict_model='CPU'. @@ -376,6 +376,10 @@ class RandomForestRegressor(BaseRandomForestModel): Acceptable formats: NumPy ndarray, Numba device ndarray, cuda array interface compliant array like CuPy These labels should be contiguous integers from 0 to n_classes. + convert_dtype : bool, optional (default = False) + When set to True, the fit method will, when necessary, convert + y to be the same data type as X if they differ. This will increase + memory used for the method. """ X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, convert_dtype) diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 68e7351bb7..2e239cc462 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -226,15 +226,16 @@ cdef class ForestInference_impl(): return algo_dict[algo_str] def get_storage_type(self, storage_type_str): - storage_type_dict={'AUTO': storage_type_t.AUTO, - 'auto': storage_type_t.AUTO, - 'DENSE': storage_type_t.DENSE, - 'dense': storage_type_t.DENSE, - 'SPARSE': storage_type_t.SPARSE, - 'sparse': storage_type_t.SPARSE} + storage_type_dict={'auto': storage_type_t.AUTO, + 'False': storage_type_t.DENSE, + 'True': storage_type_t.SPARSE} + if storage_type_str not in storage_type_dict.keys(): - raise ValueError(' Wrong sparsity selected please refer' - ' to the documentation') + raise ValueError( + "The value entered for spares_forest is not " + "supported. Please refer to the documentation at" + "(https://docs.rapids.ai/api/cuml/nightly/api.html#" + "forest-inferencing) to see the accepted values.") return storage_type_dict[storage_type_str] def predict(self, X, output_type='numpy', predict_proba=False, preds=None): @@ -336,7 +337,6 @@ cdef class ForestInference_impl(): treelite_params.threshold = threshold treelite_params.algo = self.get_algo(algo) treelite_params.storage_type = self.get_storage_type(storage_type) - cdef cumlHandle* handle_ =\ self.handle.getHandle() cdef uintptr_t model_ptr = model_handle From acdc3814e05824f56489eb45adc1a4b0b70482d1 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 17 Jun 2020 11:04:05 -0500 Subject: [PATCH 21/32] fix style errors --- python/cuml/ensemble/randomforest_common.pyx | 8 +++--- python/cuml/fil/fil.pyx | 26 ++++++++++---------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 6af8d16d35..8b8e54f56c 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -210,14 +210,14 @@ class BaseRandomForestModel(Base): " cuml GitHub issue #1679 for more" " information.") build_treelite_forest( - & cuml_model_ptr, + &cuml_model_ptr, self.rf_forest, self.n_cols, self.num_classes, model_pbuf_vec) else: build_treelite_forest( - & cuml_model_ptr, + &cuml_model_ptr, self.rf_forest, self.n_cols, REGRESSION_MODEL, @@ -416,7 +416,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): or algo='auto' Returns ---------- - fil_sparse_format converted to string + fil_sparse_format """ accepted_fil_spars_format = {True, False, 'auto'} @@ -436,7 +436,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): "supported. Please refer to the documentation at " "(https://docs.rapids.ai/api/cuml/nightly/api.html" "#forest-inferencing) to see the accepted values.") - return str(fil_sparse_format) + return fil_sparse_format def _obtain_treelite_model(treelite_handle): diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 2e239cc462..69d078164d 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -227,12 +227,12 @@ cdef class ForestInference_impl(): def get_storage_type(self, storage_type_str): storage_type_dict={'auto': storage_type_t.AUTO, - 'False': storage_type_t.DENSE, - 'True': storage_type_t.SPARSE} + False: storage_type_t.DENSE, + True: storage_type_t.SPARSE} if storage_type_str not in storage_type_dict.keys(): raise ValueError( - "The value entered for spares_forest is not " + "The value entered for storage_type is not " "supported. Please refer to the documentation at" "(https://docs.rapids.ai/api/cuml/nightly/api.html#" "forest-inferencing) to see the accepted values.") @@ -508,11 +508,11 @@ class ForestInference(Base): only if output_class == True, else it is ignored. storage_type : string (default='auto') In-memory storage format to be used for the FIL model. - 'AUTO' or 'auto' - choose the storage type automatically - (currently DENSE is always used) - 'DENSE' or 'dense' - create a dense forest - 'SPARSE' or 'sparse' - create a sparse forest; - requires algo='NAIVE' or algo='AUTO' + 'auto' - choose the storage type automatically + (currently DENSE is always used) + 'False' - create a dense forest + 'True' - create a sparse forest; + requires algo='NAIVE' or algo='AUTO' Returns ---------- @@ -563,11 +563,11 @@ class ForestInference(Base): only if output_class == True, else it is ignored. storage_type : string (default='auto') In-memory storage format to be used for the FIL model. - 'AUTO' or 'auto' - choose the storage type automatically - (currently DENSE is always used) - 'DENSE' or 'dense' - create a dense forest - 'SPARSE' or 'sparse' - create a sparse forest; - requires algo='NAIVE' or algo='AUTO'. + 'auto' - choose the storage type automatically + (currently DENSE is always used) + 'False' - create a dense forest + 'True' - create a sparse forest; + requires algo='NAIVE' or algo='AUTO' Returns ---------- From 4ed2fb7cc3306edcbc1b1085ae390cab9b88833c Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 17 Jun 2020 11:11:52 -0500 Subject: [PATCH 22/32] remove obtain_treelite_model func from common --- python/cuml/ensemble/randomforest_common.pyx | 16 +--------------- python/cuml/ensemble/randomforestclassifier.pyx | 4 ++-- python/cuml/ensemble/randomforestregressor.pyx | 5 ++--- 3 files changed, 5 insertions(+), 20 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 8b8e54f56c..a24e24365c 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -338,7 +338,7 @@ class BaseRandomForestModel(Base): check_cols=self.n_cols) if dtype == np.float64 and not convert_dtype: - raise TypeError("GPU based predict only accepts np.float32 data. \ + raise TypeError("GPU based predict only accepts float32 data. \ Please set convert_dtype=True to convert the test \ data to the same dtype as the data used to train, \ ie. float32. If you would like to use test \ @@ -439,20 +439,6 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): return fil_sparse_format -def _obtain_treelite_model(treelite_handle): - """ - Creates a Treelite model using the treelite handle - obtained from the cuML Random Forest model. - - Returns - ---------- - tl_to_fil_model : Treelite version of this model - """ - treelite_model = \ - TreeliteModel.from_treelite_model_handle(treelite_handle) - return treelite_model - - def _obtain_fil_model(treelite_handle, depth, output_class=True, threshold=0.5, algo='auto', diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 999b73a5d3..4343d32a5d 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -321,8 +321,8 @@ class RandomForestClassifier(BaseRandomForestModel): ---------- tl_to_fil_model : Treelite version of this model """ - handle = self._obtain_treelite_handle() - return _obtain_treelite_model(handle) + treelite_handle = self._obtain_treelite_handle() + return TreeliteModel.from_treelite_model_handle(treelite_handle) def convert_to_fil_model(self, output_class=True, threshold=0.5, algo='auto', diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 6a270cde53..b8c34b4ac5 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -249,7 +249,6 @@ class RandomForestRegressor(BaseRandomForestModel): state['n_cols'] = self.n_cols state["verbose"] = self.verbose state["model_pbuf_bytes"] = self.model_pbuf_bytes - state['handle'] = self.handle state["treelite_handle"] = None state["split_criterion"] = self.split_criterion state["handle"] = self.handle @@ -307,8 +306,8 @@ class RandomForestRegressor(BaseRandomForestModel): ---------- tl_to_fil_model : Treelite version of this model """ - handle = self._obtain_treelite_handle() - return _obtain_treelite_model(handle) + treelite_handle = self._obtain_treelite_handle() + return TreeliteModel.from_treelite_model_handle(treelite_handle) def convert_to_fil_model(self, output_class=False, algo='auto', From f46a9892f7af60bcd125d7c2d192baf16d344231 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 17 Jun 2020 12:27:16 -0500 Subject: [PATCH 23/32] update rf and fil to accept storage type as bool or 'auto' --- python/cuml/ensemble/__init__.py | 2 +- python/cuml/ensemble/randomforest_common.pyx | 19 ++++++++++--------- .../cuml/ensemble/randomforestclassifier.pyx | 3 +-- .../cuml/ensemble/randomforestregressor.pyx | 3 +-- python/cuml/fil/fil.pyx | 4 ++-- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/python/cuml/ensemble/__init__.py b/python/cuml/ensemble/__init__.py index 7cd2567acf..d42ae676b6 100644 --- a/python/cuml/ensemble/__init__.py +++ b/python/cuml/ensemble/__init__.py @@ -18,4 +18,4 @@ from cuml.ensemble.randomforestclassifier import RandomForestClassifier from cuml.ensemble.randomforestregressor import RandomForestRegressor from cuml.ensemble.randomforest_common import _check_fil_parameter_validity, \ - _obtain_treelite_model, _obtain_fil_model + _obtain_fil_model diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index a24e24365c..3f89b6aec1 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -266,15 +266,13 @@ class BaseRandomForestModel(Base): check_rows=self.n_rows, check_cols=1) if self.dtype == np.float64: - warnings.warn("To use GPU-based prediction, first train using \ - float 32 data to fit the estimator.") + warnings.warn("To use pickling or GPU-based prediction first " + "train using float32 data to fit the estimator") max_feature_val = self._get_max_feat_val() if type(self.min_rows_per_node) == float: self.min_rows_per_node = \ math.ceil(self.min_rows_per_node*self.n_rows) - del X - del y return X_m, y_m, max_feature_val def _tl_model_handles(self, model_bytes): @@ -334,10 +332,13 @@ class BaseRandomForestModel(Base): out_type = self._get_output_type(X) cdef ModelHandle cuml_model_ptr = NULL _, n_rows, n_cols, dtype = \ - input_to_cuml_array(X, order='F', - check_cols=self.n_cols) + input_to_cuml_array( + X, order='F', + check_cols=self.n_cols, + convert_to_dtype=(self.dtype if convert_dtype + else None)) - if dtype == np.float64 and not convert_dtype: + if dtype == np.float64: raise TypeError("GPU based predict only accepts float32 data. \ Please set convert_dtype=True to convert the test \ data to the same dtype as the data used to train, \ @@ -416,7 +417,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): or algo='auto' Returns ---------- - fil_sparse_format + fil_sparse_format as a string """ accepted_fil_spars_format = {True, False, 'auto'} @@ -436,7 +437,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): "supported. Please refer to the documentation at " "(https://docs.rapids.ai/api/cuml/nightly/api.html" "#forest-inferencing) to see the accepted values.") - return fil_sparse_format + return str(fil_sparse_format) def _obtain_fil_model(treelite_handle, depth, diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index 4343d32a5d..420482fd10 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -33,8 +33,7 @@ from cuml.common.handle import Handle from cuml.common import input_to_cuml_array, rmm_cupy_ary from cuml.ensemble.randomforest_common import BaseRandomForestModel -from cuml.ensemble.randomforest_common import _obtain_treelite_model, \ - _obtain_fil_model +from cuml.ensemble.randomforest_common import _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index b8c34b4ac5..1019374381 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -32,8 +32,7 @@ from cuml.common.handle import Handle from cuml.common import input_to_cuml_array, rmm_cupy_ary from cuml.ensemble.randomforest_common import BaseRandomForestModel -from cuml.ensemble.randomforest_common import _obtain_treelite_model, \ - _obtain_fil_model +from cuml.ensemble.randomforest_common import _obtain_fil_model from cuml.ensemble.randomforest_shared cimport * from cuml.fil.fil import TreeliteModel diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 69d078164d..1901d3e61f 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -227,8 +227,8 @@ cdef class ForestInference_impl(): def get_storage_type(self, storage_type_str): storage_type_dict={'auto': storage_type_t.AUTO, - False: storage_type_t.DENSE, - True: storage_type_t.SPARSE} + 'False': storage_type_t.DENSE, + 'True': storage_type_t.SPARSE} if storage_type_str not in storage_type_dict.keys(): raise ValueError( From a5fecceb60293a2a21ad505d7fdbb70245655531 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Wed, 17 Jun 2020 12:38:26 -0500 Subject: [PATCH 24/32] update docs --- python/cuml/ensemble/randomforest_common.pyx | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 3f89b6aec1..6afeec2c92 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -158,10 +158,9 @@ class BaseRandomForestModel(Base): Returns the self.model_pbuf_bytes. Cuml RF model gets converted to treelite protobuf bytes by: - 1. converting the cuml RF model to a treelite model. The treelite + * Converting the cuml RF model to a treelite model. The treelite models handle (pointer) is returned - - 2. The treelite model handle is used to convert the treelite model + * The treelite model handle is used to convert the treelite model to a treelite protobuf model which is stored in a temporary file. The protobuf model information is read from the temporary file and the byte information is returned. From aee0037e8598d6f19f809fe2968430926e6f1f07 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 19 Jun 2020 11:26:48 -0500 Subject: [PATCH 25/32] update rf code --- python/cuml/dask/ensemble/base.py | 5 +- .../dask/ensemble/randomforestclassifier.py | 20 ++----- .../dask/ensemble/randomforestregressor.py | 17 ++---- python/cuml/ensemble/randomforest_common.pyx | 56 +++++++++---------- 4 files changed, 43 insertions(+), 55 deletions(-) diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py index fe94321db0..e4b3dc9e75 100644 --- a/python/cuml/dask/ensemble/base.py +++ b/python/cuml/dask/ensemble/base.py @@ -15,6 +15,7 @@ import dask import math +from dask.distributed import wait from cuml.dask.common.input_utils import DistributedDataHandler, \ concatenate @@ -124,10 +125,12 @@ def _concat_treelite_models(self): model._concatenate_treelite_handle(all_tl_mod_handles) for tl_handle in all_tl_mod_handles: TreeliteModel.free_treelite_model(tl_handle) - + wait(model) return model def _predict_using_fil(self, X, delayed, **kwargs): + if self.local_model is None: + self.local_model = self._concat_treelite_models() data = DistributedDataHandler.create(X, client=self.client) self.datatype = data.datatype return self._predict(X, delayed=delayed, **kwargs) diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 27973f219e..422029b277 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -295,23 +295,15 @@ def predict(self, X, output_class=True, algo='auto', threshold=0.5, else: preds = \ - self.predict_using_fil(X, output_class=output_class, - algo=algo, - threshold=threshold, - convert_dtype=convert_dtype, - predict_model="GPU", - fil_sparse_format=fil_sparse_format, - delayed=delayed) + self._predict_using_fil(X, output_class=output_class, + algo=algo, + threshold=threshold, + convert_dtype=convert_dtype, + fil_sparse_format=fil_sparse_format, + delayed=delayed) return preds - def predict_using_fil(self, X, delayed, **kwargs): - if self.local_model is None: - self.local_model = self._concat_treelite_models() - - return self._predict_using_fil(X=X, - delayed=delayed, - **kwargs) """ TODO : Update function names used for CPU predict. Cuml issue #1854 has been created to track this. diff --git a/python/cuml/dask/ensemble/randomforestregressor.py b/python/cuml/dask/ensemble/randomforestregressor.py index dd98660f0c..09b204a455 100755 --- a/python/cuml/dask/ensemble/randomforestregressor.py +++ b/python/cuml/dask/ensemble/randomforestregressor.py @@ -270,20 +270,13 @@ def predict(self, X, predict_model="GPU", algo='auto', else: preds = \ - self.predict_using_fil(X, predict_model=predict_model, - algo=algo, - convert_dtype=convert_dtype, - fil_sparse_format=fil_sparse_format, - delayed=delayed) + self._predict_using_fil(X, + algo=algo, + convert_dtype=convert_dtype, + fil_sparse_format=fil_sparse_format, + delayed=delayed) return preds - def predict_using_fil(self, X, delayed, **kwargs): - if self.local_model is None: - self.local_model = self._concat_treelite_models() - return self._predict_using_fil(X=X, - delayed=delayed, - **kwargs) - """ TODO : Update function names used for CPU predict. Cuml issue #1854 has been created to track this. diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index d9a313e65d..06b6817a1b 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -184,37 +184,38 @@ class BaseRandomForestModel(Base): return self.treelite_serialized_model def _obtain_treelite_handle(self): + assert self.treelite_serialized_model or self.rf_forest, \ + "Attempting to create treelite from un-fit forest." + + cdef ModelHandle tl_handle = NULL if self.treelite_handle: return self.treelite_handle # Use cached version - cdef ModelHandle tl_handle = NULL - if self.treelite_serialized_model: # bytes -> Treelite + elif self.treelite_serialized_model: # bytes -> Treelite tl_handle = treelite_deserialize( self.treelite_serialized_model) - assert self.treelite_serialized_model or self.rf_forest, \ - "Attempting to create treelite from un-fit forest." - - if self.RF_type == CLASSIFICATION: - if self.num_classes > 2: - raise NotImplementedError("Pickling for multi-class " - "classification models is currently" - " not implemented. Please check" - " cuml GitHub issue #1679 for more" - " information.") - build_treelite_forest( - &tl_handle, - self.rf_forest, - self.n_cols, - self.num_classes, - model_pbuf_vec) else: - build_treelite_forest( - &tl_handle, - self.rf_forest, - self.n_cols, - REGRESSION_MODEL, - model_pbuf_vec) + if self.RF_type == CLASSIFICATION: + if self.num_classes > 2: + raise NotImplementedError( + "Pickling for multi-class classification models" + " is currently not implemented. Please check" + " cuml GitHub issue #1679 for more information.") + + build_treelite_forest( + &tl_handle, + + self.rf_forest, + self.n_cols, + self.num_classes) + else: + build_treelite_forest( + &tl_handle, + + self.rf_forest, + self.n_cols, + REGRESSION_MODEL) self.treelite_handle = tl_handle return self.treelite_handle @@ -298,10 +299,9 @@ class BaseRandomForestModel(Base): return self - def _predict_model_on_gpu(self, X, output_class, - threshold, algo, - num_classes, convert_dtype, - fil_sparse_format, predict_proba): + def _predict_model_on_gpu(self, X, algo, convert_dtype, + fil_sparse_format, threshold=0.5, + output_class=False, predict_proba=False): out_type = self._get_output_type(X) _, n_rows, n_cols, dtype = \ input_to_cuml_array(X, order='F', From 845ec9368b61ba127a4e5b9e81be81f0bc1bf099 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 19 Jun 2020 11:52:16 -0500 Subject: [PATCH 26/32] remove debugging code from base --- python/cuml/dask/ensemble/base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py index e4b3dc9e75..0777841c2b 100644 --- a/python/cuml/dask/ensemble/base.py +++ b/python/cuml/dask/ensemble/base.py @@ -15,7 +15,6 @@ import dask import math -from dask.distributed import wait from cuml.dask.common.input_utils import DistributedDataHandler, \ concatenate @@ -125,7 +124,6 @@ def _concat_treelite_models(self): model._concatenate_treelite_handle(all_tl_mod_handles) for tl_handle in all_tl_mod_handles: TreeliteModel.free_treelite_model(tl_handle) - wait(model) return model def _predict_using_fil(self, X, delayed, **kwargs): From 8fdccd4a0284601c312ecbd99ebf2f45dacc829d Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 19 Jun 2020 12:29:16 -0500 Subject: [PATCH 27/32] update fil to accept string and boolean --- python/cuml/ensemble/randomforest_common.pyx | 4 ++-- python/cuml/fil/fil.pyx | 22 ++++++++++---------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 06b6817a1b..f77436c5b2 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -386,7 +386,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): or algo='auto' Returns ---------- - fil_sparse_format as a string + fil_sparse_format """ accepted_fil_spars_format = {True, False, 'auto'} @@ -406,7 +406,7 @@ def _check_fil_parameter_validity(depth, algo, fil_sparse_format): "supported. Please refer to the documentation at " "(https://docs.rapids.ai/api/cuml/nightly/api.html" "#forest-inferencing) to see the accepted values.") - return str(fil_sparse_format) + return fil_sparse_format def _obtain_fil_model(treelite_handle, depth, diff --git a/python/cuml/fil/fil.pyx b/python/cuml/fil/fil.pyx index 1901d3e61f..7e7a92b584 100644 --- a/python/cuml/fil/fil.pyx +++ b/python/cuml/fil/fil.pyx @@ -506,12 +506,12 @@ class ForestInference(Base): threshold : float (default=0.5) Threshold is used to for classification. It is applied only if output_class == True, else it is ignored. - storage_type : string (default='auto') + storage_type : string or boolean (default='auto') In-memory storage format to be used for the FIL model. 'auto' - choose the storage type automatically (currently DENSE is always used) - 'False' - create a dense forest - 'True' - create a sparse forest; + False - create a dense forest + True - create a sparse forest; requires algo='NAIVE' or algo='AUTO' Returns @@ -523,12 +523,12 @@ class ForestInference(Base): if isinstance(model, TreeliteModel): # TreeliteModel defined in this file return self._impl.load_from_treelite_model( - model, output_class, algo, threshold, storage_type) + model, output_class, algo, threshold, str(storage_type)) else: # assume it is treelite.Model return self._impl.load_from_treelite_model_handle( model.handle.value, output_class, algo, threshold, - storage_type) + str(storage_type)) @staticmethod def load_from_sklearn(skl_model, @@ -561,12 +561,12 @@ class ForestInference(Base): threshold : float (default=0.5) Threshold is used to for classification. It is applied only if output_class == True, else it is ignored. - storage_type : string (default='auto') + storage_type : string or boolean (default='auto') In-memory storage format to be used for the FIL model. 'auto' - choose the storage type automatically (currently DENSE is always used) - 'False' - create a dense forest - 'True' - create a sparse forest; + False - create a dense forest + True - create a sparse forest; requires algo='NAIVE' or algo='AUTO' Returns @@ -584,7 +584,7 @@ class ForestInference(Base): tl_model = tl_skl.import_model(skl_model) cuml_fm.load_from_treelite_model( tl_model, algo=algo, output_class=output_class, - storage_type=storage_type, threshold=threshold) + storage_type=str(storage_type), threshold=threshold) return cuml_fm @staticmethod @@ -632,7 +632,7 @@ class ForestInference(Base): cuml_fm.load_from_treelite_model(tl_model, algo=algo, output_class=output_class, - storage_type=storage_type, + storage_type=str(storage_type), threshold=threshold) return cuml_fm @@ -673,4 +673,4 @@ class ForestInference(Base): return self._impl.load_using_treelite_handle(model_handle, output_class, algo, threshold, - storage_type) + str(storage_type)) From 49915f126b5fc4ef997f3b71408f6b5c9576f2d8 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Fri, 19 Jun 2020 13:34:58 -0500 Subject: [PATCH 28/32] update fil tests --- python/cuml/test/test_fil.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/python/cuml/test/test_fil.py b/python/cuml/test/test_fil.py index 281ba71742..856d78ac25 100644 --- a/python/cuml/test/test_fil.py +++ b/python/cuml/test/test_fil.py @@ -190,16 +190,15 @@ def test_fil_regression(n_rows, n_columns, num_rounds, tmp_path, max_depth): @pytest.mark.parametrize('n_columns', [20]) @pytest.mark.parametrize('n_estimators', [1, 10]) @pytest.mark.parametrize('max_depth', [2, 10, 20]) -@pytest.mark.parametrize('storage_type', ['False', 'True']) +@pytest.mark.parametrize('storage_type', [False, True]) @pytest.mark.parametrize('model_class', [GradientBoostingClassifier, RandomForestClassifier]) @pytest.mark.xfail(not check_min_treelite_version(), reason="need to install treelite version 0.90") def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth, storage_type, model_class): - # skip depth 20 for dense tests - if max_depth == 20 and storage_type == 'False': + if max_depth == 20 and not storage_type: return # settings @@ -236,7 +235,7 @@ def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth, skl_acc = accuracy_score(y_validation, skl_preds > 0.5) - algo = 'NAIVE' if storage_type == 'True' else 'BATCH_TREE_REORG' + algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG' fm = ForestInference.load_from_sklearn(skl_model, algo=algo, @@ -260,7 +259,7 @@ def test_fil_skl_classification(n_rows, n_columns, n_estimators, max_depth, @pytest.mark.parametrize('n_columns', [20]) @pytest.mark.parametrize('n_estimators', [1, 10]) @pytest.mark.parametrize('max_depth', [2, 10, 20]) -@pytest.mark.parametrize('storage_type', ['False', 'True']) +@pytest.mark.parametrize('storage_type', [False, True]) @pytest.mark.parametrize('model_class', [GradientBoostingRegressor, RandomForestRegressor]) @pytest.mark.xfail(not check_min_treelite_version(), @@ -269,7 +268,7 @@ def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth, storage_type, model_class): # skip depth 20 for dense tests - if max_depth == 20 and storage_type == 'False': + if max_depth == 20 and not storage_type: return # settings @@ -303,7 +302,7 @@ def test_fil_skl_regression(n_rows, n_columns, n_estimators, max_depth, skl_mse = mean_squared_error(y_validation, skl_preds) - algo = 'NAIVE' if storage_type == 'True' else 'BATCH_TREE_REORG' + algo = 'NAIVE' if storage_type else 'BATCH_TREE_REORG' fm = ForestInference.load_from_sklearn(skl_model, algo=algo, @@ -355,8 +354,7 @@ def test_output_algos(algo, small_classifier_and_preds): @pytest.mark.skipif(has_xgboost() is False, reason="need to install xgboost") @pytest.mark.parametrize('storage_type', - ['AUTO', 'False', 'True', 'auto', 'dense', - 'True']) + [False, True, 'auto']) def test_output_storage_type(storage_type, small_classifier_and_preds): model_path, X, xgb_preds = small_classifier_and_preds fm = ForestInference.load(model_path, From 4e350d53cdd4dd59ae99c8ce3cadd4f1c06af594 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Sun, 28 Jun 2020 23:26:28 -0500 Subject: [PATCH 29/32] update CHANGELOG.md --- CHANGELOG.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eda5bc861e..946cb7ab54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,11 +26,6 @@ - PR #2340: Import ARIMA in the root init file and fix the `test_fit_function` test - PR #2408: Install meta packages for dependencies - PR #2417: Move doc customization scripts to Jenkins -<<<<<<< HEAD -- PR #2411 Refactor Mixin classes and use in classifier/regressor estimators -- PR #2237: Refactor RF cython code -- PR #2403 Support for input and output type consistency in logistic regression predict_proba -======= - PR #2433: Add libcumlprims_mg to CMake - PR #2420: Add and set convert_dtype default to True in estimator fit methods - PR #2411: Refactor Mixin classes and use in classifier/regressor estimators @@ -40,7 +35,7 @@ - PR #2440: Use Treelite Conda package - PR #2403: Support for input and output type consistency in logistic regression predict_proba - PR #2468: Add `_n_features_in_` attribute to all single GPU estimators that implement fit ->>>>>>> 119f8b61d7613b50fec63be10633415101c978a5 +- PR #2237: Refactor RF cython code ## Bug Fixes - PR #2369: Update RF code to fix set_params memory leak From 78e8b34e0dbeaf2d46bf03de8ec1e11ec0478e65 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Tue, 30 Jun 2020 21:38:02 -0500 Subject: [PATCH 30/32] update benchmark algo.py file --- python/cuml/benchmark/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/benchmark/algorithms.py b/python/cuml/benchmark/algorithms.py index e2d90f57bd..351d3b23f0 100644 --- a/python/cuml/benchmark/algorithms.py +++ b/python/cuml/benchmark/algorithms.py @@ -396,7 +396,7 @@ def all_algorithms(): fil_algo="AUTO", output_class=False, threshold=0.5, - storage_type="AUTO", + storage_type="auto", ), name="FIL", accepts_labels=False, From ddc5910abc9751924386b41f623fe0b31fc0b681 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Thu, 2 Jul 2020 17:46:59 -0500 Subject: [PATCH 31/32] removed unnecessary cimport from common.pyx --- python/cuml/ensemble/randomforest_common.pyx | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 4e765ed9ae..51b77de285 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -33,8 +33,6 @@ from cuml.ensemble.randomforest_shared import treelite_serialize, \ from cuml.ensemble.randomforest_shared cimport * from cuml.common import input_to_cuml_array, rmm_cupy_ary -cimport cython - class BaseRandomForestModel(Base): variables = ['n_estimators', 'max_depth', 'handle', From 33a0d898ec00dea9c51a1fa457fefb6c425477e7 Mon Sep 17 00:00:00 2001 From: salonijain27 Date: Mon, 13 Jul 2020 01:44:30 -0500 Subject: [PATCH 32/32] resolve merge conflicts --- python/cuml/dask/ensemble/base.py | 7 +++++-- python/cuml/dask/ensemble/randomforestclassifier.py | 5 ++--- python/cuml/ensemble/randomforest_common.pyx | 2 ++ python/cuml/ensemble/randomforestregressor.pyx | 1 + python/cuml/test/dask/test_random_forest.py | 2 +- 5 files changed, 11 insertions(+), 6 deletions(-) diff --git a/python/cuml/dask/ensemble/base.py b/python/cuml/dask/ensemble/base.py index 090feabb2f..35eaad828d 100644 --- a/python/cuml/dask/ensemble/base.py +++ b/python/cuml/dask/ensemble/base.py @@ -85,6 +85,7 @@ def _estimators_per_worker(self, n_estimators): def _fit(self, model, dataset, convert_dtype): data = DistributedDataHandler.create(dataset, client=self.client) + print(" data : ", data) self.datatype = data.datatype if self.datatype == 'cudf': has_float64 = (dataset[0].dtypes.any() == np.float64) @@ -101,6 +102,7 @@ def _fit(self, model, dataset, convert_dtype): len(dask.array.unique(labels).compute()) labels = self.client.persist(dataset[1]) futures = list() + print(" data.worker_to_parts.items() : ", data.worker_to_parts.items()) for idx, (worker, worker_data) in \ enumerate(data.worker_to_parts.items()): futures.append( @@ -112,6 +114,7 @@ def _fit(self, model, dataset, convert_dtype): workers=[worker], pure=False) ) + print(" futures : ", futures) wait_and_raise_from_futures(futures) return self @@ -142,8 +145,8 @@ def _concat_treelite_models(self): return model def _predict_using_fil(self, X, delayed, **kwargs): - if self.local_model is None: - self.local_model = self._concat_treelite_models() + if self._get_internal_model() is None: + self._set_internal_model(self._concat_treelite_models()) data = DistributedDataHandler.create(X, client=self.client) self.datatype = data.datatype if self.local_model is None: diff --git a/python/cuml/dask/ensemble/randomforestclassifier.py b/python/cuml/dask/ensemble/randomforestclassifier.py index 95ef5883be..e919d33f4f 100755 --- a/python/cuml/dask/ensemble/randomforestclassifier.py +++ b/python/cuml/dask/ensemble/randomforestclassifier.py @@ -438,9 +438,8 @@ def predict_proba(self, X, y : NumPy Dask cuDF dataframe or CuPy backed Dask Array (n_rows, n_classes) """ - if self.local_model is None: - self.local_model = self._concat_treelite_models() - + if self._get_internal_model() is None: + self._set_internal_model(self._concat_treelite_models()) data = DistributedDataHandler.create(X, client=self.client) self.datatype = data.datatype return self._predict_proba(X, delayed, **kwargs) diff --git a/python/cuml/ensemble/randomforest_common.pyx b/python/cuml/ensemble/randomforest_common.pyx index 51b77de285..487fbdfba0 100644 --- a/python/cuml/ensemble/randomforest_common.pyx +++ b/python/cuml/ensemble/randomforest_common.pyx @@ -227,6 +227,8 @@ class BaseRandomForestModel(Base): X_m, self.n_rows, self.n_cols, self.dtype = \ input_to_cuml_array(X, check_dtype=[np.float32, np.float64], order='F') + print(" shape of input data, rows : ", self.n_rows) + print(" shape of input data, cols : ", self.n_cols) if self.n_bins > self.n_rows: raise ValueError("The number of bins,`n_bins` can not be greater" " than the number of samples used for training.") diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 18351f8150..fff60902d7 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -442,6 +442,7 @@ class RandomForestRegressor(BaseRandomForestModel, RegressorMixin): # make sure that the `fit` is complete before the following delete # call happens self.handle.sync() + print(" fit model : ", self.print_summary()) del X_m del y_m return self diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py index a6f8ad927d..0e4a393733 100644 --- a/python/cuml/test/dask/test_random_forest.py +++ b/python/cuml/test/dask/test_random_forest.py @@ -293,7 +293,7 @@ def test_rf_concatenation_dask(client, model_type): res1 = cu_rf_mg.predict(X_df) res1.compute() local_tl = TreeliteModel.from_treelite_model_handle( - cu_rf_mg.local_model._obtain_treelite_handle(), + cu_rf_mg.internal_model._obtain_treelite_handle(), take_handle_ownership=False) assert local_tl.num_trees == n_estimators