Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use the new RF backend by default for classification #3686

Merged
merged 14 commits into from
Apr 7, 2021
Merged
2 changes: 1 addition & 1 deletion cpp/test/sg/rf_accuracy_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ class RFClassifierAccuracyTest : public ::testing::TestWithParam<RFInputs> {
sc, /* split_criterion */
false, /* quantile_per_tree */
1, /* n_streams */
false, /* use_experimental_backend */
true, /* use_experimental_backend */
128 /* max_batch_size */
);
}
Expand Down
6 changes: 3 additions & 3 deletions cpp/test/sg/rf_depth_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
protected:
void basicTest() {
const int max_depth = ::testing::TestWithParam<int>::GetParam();
params = RfInputs<T>{5000,
params = RfInputs<T>{10000,
10,
1,
1.0f,
Expand All @@ -73,7 +73,7 @@ class RfClassifierDepthTest : public ::testing::TestWithParam<int> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);
hcho3 marked this conversation as resolved.
Show resolved Hide resolved

int data_len = params.n_rows * params.n_cols;
raft::allocate(data, data_len);
Expand Down Expand Up @@ -165,7 +165,7 @@ class RfRegressorDepthTest : public ::testing::TestWithParam<int> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);

int data_len = params.n_rows * params.n_cols;
raft::allocate(data, data_len);
Expand Down
10 changes: 5 additions & 5 deletions cpp/test/sg/rf_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class RfClassifierTest : public ::testing::TestWithParam<RfInputs<T>> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);
hcho3 marked this conversation as resolved.
Show resolved Hide resolved

//--------------------------------------------------------
// Random Forest
Expand Down Expand Up @@ -258,7 +258,7 @@ const std::vector<RfInputs<float>> inputsf2_clf = {
CRITERION::
GINI}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2,
1, 2, 0.0, 1,
CRITERION::
CRITERION_END}, //forest with 10 trees, with bootstrap and column subsampling enabled, 3 bins, different split algorithm
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
Expand All @@ -270,7 +270,7 @@ const std::vector<RfInputs<float>> inputsf2_clf = {
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::ENTROPY},
1, 2, 0.0, 2, CRITERION::ENTROPY},
{50, 10, 10, 0.8f, 0.8f, 10, 7, -1, true, true, 3,
SPLIT_ALGO::GLOBAL_QUANTILE, 2, 2, 0.0, 2, CRITERION::ENTROPY}};

Expand All @@ -284,7 +284,7 @@ const std::vector<RfInputs<double>> inputsd2_clf = { // Same as inputsf2_clf
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::GINI},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::CRITERION_END},
1, 2, 0.0, 2, CRITERION::CRITERION_END},
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 1, 1.0f, 1.0f, 4, 7, -1, false, false, 4, SPLIT_ALGO::HIST, 2, 2, 0.0,
Expand All @@ -294,7 +294,7 @@ const std::vector<RfInputs<double>> inputsd2_clf = { // Same as inputsf2_clf
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::HIST, 2, 2, 0.0,
2, CRITERION::ENTROPY},
{4, 2, 10, 0.8f, 0.8f, 4, 7, -1, true, false, 3, SPLIT_ALGO::GLOBAL_QUANTILE,
2, 2, 0.0, 2, CRITERION::ENTROPY},
1, 2, 0.0, 2, CRITERION::ENTROPY},
{50, 10, 10, 0.8f, 0.8f, 10, 7, -1, true, true, 3,
SPLIT_ALGO::GLOBAL_QUANTILE, 2, 2, 0.0, 2, CRITERION::ENTROPY}};

Expand Down
2 changes: 1 addition & 1 deletion cpp/test/sg/rf_treelite_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ class RfTreeliteTestCommon : public ::testing::TestWithParam<RfInputs<T>> {
params.split_algo, params.min_samples_leaf, params.min_samples_split,
params.min_impurity_decrease, params.bootstrap_features, params.bootstrap,
params.n_trees, params.max_samples, 0, params.split_criterion, false,
params.n_streams, false, 128);
params.n_streams, true, 128);

handle.reset(new raft::handle_t(rf_params.n_streams));

Expand Down
8 changes: 8 additions & 0 deletions python/cuml/dask/ensemble/randomforestclassifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,14 @@ class RandomForestClassifier(BaseRandomForestModel, DelayedPredictionMixin,
quantile_per_tree : boolean (default = False)
Whether quantile is computed for individual RF trees.
Only relevant for GLOBAL_QUANTILE split_algo.
use_experimental_backend : boolean (default = True)
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for classification tasks but
not yet for regression tasks. The RAPIDS team is continuing
optimization and evaluation of the new backend for regression tasks.
n_streams : int (default = 4 )
Number of parallel streams used for forest building
workers : optional, list of strings
Expand Down
11 changes: 11 additions & 0 deletions python/cuml/dask/ensemble/randomforestregressor.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,17 @@ class RandomForestRegressor(BaseRandomForestModel, DelayedPredictionMixin,
for median of abs error : 'median_ae'
for mean of abs error : 'mean_ae'
for mean square error' : 'mse'
quantile_per_tree : boolean (default = False)
Whether quantile is computed for individual RF trees.
Only relevant for GLOBAL_QUANTILE split_algo.
use_experimental_backend : boolean (default = False)
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for classification tasks but
not yet for regression tasks. The RAPIDS team is continuing
optimization and evaluation of the new backend for regression tasks.
n_streams : int (default = 4 )
Number of parallel streams used for forest building
workers : optional, list of strings
Expand Down
22 changes: 15 additions & 7 deletions python/cuml/ensemble/randomforestclassifier.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -208,8 +208,10 @@ class RandomForestClassifier(BaseRandomForestModel,
If 'auto' then max_features=1/sqrt(n_features).
If 'sqrt' then max_features=1/sqrt(n_features).
If 'log2' then max_features=log2(n_features)/n_features.
n_bins : int (default = 8)
n_bins : int (default = 32)
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
Number of bins used by the split algorithm.
For large problems, particularly those with highly-skewed input data,
increasing the number of bins may improve accuracy.
min_samples_leaf : int or float (default = 1)
The minimum number of samples (rows) in each leaf node.
If int, then min_samples_leaf represents the minimum number.
Expand All @@ -232,11 +234,14 @@ class RandomForestClassifier(BaseRandomForestModel,
.. deprecated:: 0.19
Parameter 'quantile_per_tree' is deprecated and will be removed in
subsequent release.
use_experimental_backend : boolean (default = False)
If set to true and the following conditions are also met, experimental
decision tree training implementation would be used only if
`split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
(No per tree quantile computation).
use_experimental_backend : boolean (default = True)
hcho3 marked this conversation as resolved.
Show resolved Hide resolved
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for the classification task but
not for the regression task. We are currently evaluating the impact
of the new backend on the regression task.
max_batch_size: int (default = 128)
Maximum number of nodes that can be processed in a given batch. This is
used only when 'use_experimental_backend' is true. Does not currently
Expand Down Expand Up @@ -265,7 +270,8 @@ class RandomForestClassifier(BaseRandomForestModel,
"""

def __init__(self, *, split_criterion=0, handle=None, verbose=False,
output_type=None, **kwargs):
output_type=None, n_bins=32, use_experimental_backend=True,
**kwargs):

self.RF_type = CLASSIFICATION
self.num_classes = 2
Expand All @@ -274,6 +280,8 @@ class RandomForestClassifier(BaseRandomForestModel,
handle=handle,
verbose=verbose,
output_type=output_type,
n_bins=n_bins,
use_experimental_backend=use_experimental_backend,
**kwargs)

"""
Expand Down
13 changes: 9 additions & 4 deletions python/cuml/ensemble/randomforestregressor.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ class RandomForestRegressor(BaseRandomForestModel,
If 'log2' then max_features=log2(n_features)/n_features.
n_bins : int (default = 8)
Number of bins used by the split algorithm.
For large problems, particularly those with highly-skewed input data,
increasing the number of bins may improve accuracy.
min_samples_leaf : int or float (default = 1)
The minimum number of samples (rows) in each leaf node.
If int, then min_samples_leaf represents the minimum number.
Expand Down Expand Up @@ -224,10 +226,13 @@ class RandomForestRegressor(BaseRandomForestModel,
Parameter 'quantile_per_tree' is deprecated and will be removed in
subsequent release.
use_experimental_backend : boolean (default = False)
If set to true and following conditions are also met, experimental
decision tree training implementation would be used only if
`split_algo = 1` (GLOBAL_QUANTILE) and `quantile_per_tree = False`
(No per tree quantile computation).
If set to true and the following conditions are also met, a new
experimental backend for decision tree training will be used. The
new backend is available only if `split_algo = 1` (GLOBAL_QUANTILE)
and `quantile_per_tree = False` (No per tree quantile computation).
The new backend is considered stable for the classification task but
not for the regression task. We are currently evaluating the impact
of the new backend for the regression task.
max_batch_size: int (default = 128)
Maximum number of nodes that can be processed in a given batch. This is
used only when 'use_experimental_backend' is true.
Expand Down
2 changes: 1 addition & 1 deletion python/cuml/test/dask/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,7 +470,7 @@ def test_rf_instance_count(client, max_depth, n_estimators):
n_bins=16, split_algo=1, split_criterion=0,
min_samples_leaf=2, seed=23707, n_streams=1,
n_estimators=n_estimators, max_leaves=-1,
max_depth=max_depth, use_experimental_backend=True)
max_depth=max_depth)
y = y.astype(np.int32)

X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2)
Expand Down
78 changes: 31 additions & 47 deletions python/cuml/test/test_random_forest.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,8 @@ def special_reg(request):
@pytest.mark.parametrize('datatype', [np.float32])
@pytest.mark.parametrize('split_algo', [0, 1])
@pytest.mark.parametrize('max_features', [1.0, 'auto', 'log2', 'sqrt'])
@pytest.mark.parametrize('use_experimental_backend', [True, False])
def test_rf_classification(small_clf, datatype, split_algo,
max_samples, max_features,
use_experimental_backend):
max_samples, max_features):
use_handle = True

X, y = small_clf
Expand All @@ -165,26 +163,24 @@ def test_rf_classification(small_clf, datatype, split_algo,
n_bins=16, split_algo=split_algo, split_criterion=0,
min_samples_leaf=2, random_state=123, n_streams=1,
n_estimators=40, handle=handle, max_leaves=-1,
max_depth=16,
use_experimental_backend=use_experimental_backend)
max_depth=16)
f = io.StringIO()
with redirect_stdout(f):
cuml_model.fit(X_train, y_train)
captured_stdout = f.getvalue()
if use_experimental_backend:
is_fallback_used = False
if split_algo != 1:
assert ('Experimental backend does not yet support histogram ' +
'split algorithm' in captured_stdout)
is_fallback_used = True
if is_fallback_used:
assert ('Not using the experimental backend due to above ' +
'mentioned reason(s)' in captured_stdout)
else:
assert ('Using experimental backend for growing trees'
in captured_stdout)

is_fallback_used = False
if split_algo != 1:
assert ('Experimental backend does not yet support histogram ' +
'split algorithm' in captured_stdout)
is_fallback_used = True
if is_fallback_used:
assert ('Not using the experimental backend due to above ' +
'mentioned reason(s)' in captured_stdout)
else:
assert captured_stdout == ''
assert ('Using experimental backend for growing trees'
in captured_stdout)

fil_preds = cuml_model.predict(X_test,
predict_model="GPU",
threshold=0.5,
Expand Down Expand Up @@ -856,8 +852,7 @@ def predict_with_json_rf_regressor(rf, x):

@pytest.mark.parametrize('max_depth', [1, 2, 3, 5, 10, 15, 20])
@pytest.mark.parametrize('n_estimators', [5, 10, 20])
@pytest.mark.parametrize('use_experimental_backend', [True, False])
def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
def test_rf_instance_count(max_depth, n_estimators):
X, y = make_classification(n_samples=350, n_features=20,
n_clusters_per_class=1, n_informative=10,
random_state=123, n_classes=2)
Expand All @@ -866,8 +861,7 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
n_bins=16, split_algo=1, split_criterion=0,
min_samples_leaf=2, random_state=23707, n_streams=1,
n_estimators=n_estimators, max_leaves=-1,
max_depth=max_depth,
use_experimental_backend=use_experimental_backend)
max_depth=max_depth)
y = y.astype(np.int32)

# Train model on the data
Expand All @@ -879,31 +873,21 @@ def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend):
# The instance count of each node must be equal to the sum of
# the instance counts of its children. Note that the instance count
# is only available with the new backend.
if use_experimental_backend:
def check_instance_count_for_non_leaf(tree):
assert 'instance_count' in tree
if 'children' not in tree:
return
assert 'instance_count' in tree['children'][0]
assert 'instance_count' in tree['children'][1]
assert (tree['instance_count']
== tree['children'][0]['instance_count']
+ tree['children'][1]['instance_count'])
check_instance_count_for_non_leaf(tree['children'][0])
check_instance_count_for_non_leaf(tree['children'][1])
for tree in json_obj:
check_instance_count_for_non_leaf(tree)
# The root's count must be equal to the number of rows in the data
assert tree['instance_count'] == X.shape[0]
else:
def assert_instance_count_absent(tree):
assert 'instance_count' not in tree
if 'children' not in tree:
return
assert_instance_count_absent(tree['children'][0])
assert_instance_count_absent(tree['children'][1])
for tree in json_obj:
assert_instance_count_absent(tree)
def check_instance_count_for_non_leaf(tree):
assert 'instance_count' in tree
if 'children' not in tree:
return
assert 'instance_count' in tree['children'][0]
assert 'instance_count' in tree['children'][1]
assert (tree['instance_count']
== tree['children'][0]['instance_count']
+ tree['children'][1]['instance_count'])
check_instance_count_for_non_leaf(tree['children'][0])
check_instance_count_for_non_leaf(tree['children'][1])
for tree in json_obj:
check_instance_count_for_non_leaf(tree)
# The root's count must be equal to the number of rows in the data
assert tree['instance_count'] == X.shape[0]


@pytest.mark.memleak
Expand Down