-
Notifications
You must be signed in to change notification settings - Fork 546
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Transforms RandomForest estimators' non-consecutive labels to consecutive labels where appropriate #4780
Transforms RandomForest estimators' non-consecutive labels to consecutive labels where appropriate #4780
Changes from 2 commits
12efd59
bbdb64d
974446c
de39e28
7f934ac
b0b7389
e2ee5be
2448689
489329d
e8adf36
9bd7a87
396a498
8cd8d3a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -307,6 +307,66 @@ def test_rf_classification(small_clf, datatype, max_samples, max_features): | |
assert fil_acc >= (cuml_acc - 0.07) # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa | ||
|
||
|
||
@pytest.mark.parametrize(
    "max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)]
)
@pytest.mark.parametrize("datatype", [np.float32, np.float64])
# Per review feedback, a single (max_features, a, b) combination per datatype
# is enough to exercise the non-consecutive-label code path; the full matrix
# added nothing but runtime.
@pytest.mark.parametrize("max_features", [1.0])
@pytest.mark.parametrize("b", [-5])
@pytest.mark.parametrize("a", [2])
def test_rf_classification_unorder(
    small_clf, datatype, max_samples, max_features, a, b
):
    """RF classification with non-consecutive (affinely remapped) labels.

    Labels are remapped with ``y -> a*y + b`` before fitting so the estimator
    must handle a non-consecutive label set; FIL (GPU) accuracy is then
    compared against sklearn and against cuML's CPU predict path.
    """
    use_handle = True

    X, y = small_clf
    X = X.astype(datatype)
    y = y.astype(np.int32)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.8, random_state=0
    )
    # Create a handle for the cuml model
    handle, stream = get_handle(use_handle, n_streams=1)

    # Initialize, fit and predict using cuML's
    # random forest classification model
    cuml_model = curfc(
        max_features=max_features,
        max_samples=max_samples,
        n_bins=16,
        split_criterion=0,
        min_samples_leaf=2,
        random_state=123,
        n_streams=1,
        n_estimators=40,
        handle=handle,
        max_leaves=-1,
        max_depth=16,
    )
    # Affine transformation: remap labels to a non-consecutive set.
    # NOTE: the test labels must be remapped identically, otherwise
    # accuracy_score compares predictions (in the transformed label space)
    # against untransformed labels and every accuracy degenerates to ~0.
    y_train = a * y_train + b
    y_test = a * y_test + b
    cuml_model.fit(X_train, y_train)

    fil_preds = cuml_model.predict(
        X_test, predict_model="GPU", threshold=0.5, algo="auto"
    )
    cu_preds = cuml_model.predict(X_test, predict_model="CPU")
    fil_preds = np.reshape(fil_preds, np.shape(cu_preds))
    cuml_acc = accuracy_score(y_test, cu_preds)
    fil_acc = accuracy_score(y_test, fil_preds)
    if X.shape[0] < 500000:
        sk_model = skrfc(
            n_estimators=40,
            max_depth=16,
            min_samples_split=2,
            max_features=max_features,
            random_state=10,
        )
        sk_model.fit(X_train, y_train)
        sk_preds = sk_model.predict(X_test)
        sk_acc = accuracy_score(y_test, sk_preds)
        assert fil_acc >= (sk_acc - 0.07)
    assert fil_acc >= (cuml_acc - 0.07)  # to be changed to 0.02. see issue #3910: https://github.com/rapidsai/cuml/issues/3910 # noqa
|
||
|
||
@pytest.mark.parametrize( | ||
"max_samples", [unit_param(1.0), quality_param(0.90), stress_param(0.95)] | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We should be reusing existing primitives where at all possible and using the make_monotonic primitive to do this. That allows us to optimize this specific operation once and have it benefit all uses.