diff --git a/cpp/include/cuml/tree/flatnode.h b/cpp/include/cuml/tree/flatnode.h index 138182e550..77dbc86a01 100644 --- a/cpp/include/cuml/tree/flatnode.h +++ b/cpp/include/cuml/tree/flatnode.h @@ -32,6 +32,7 @@ struct SparseTreeNode { DataT best_metric_val; IdxT left_child_id = IdxT(-1); uint32_t unique_id = UINT32_MAX; + uint32_t instance_count = UINT32_MAX; // UINT32_MAX indicates n/a }; template diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index 6a42e17115..e46f49fb9f 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -37,6 +37,7 @@ void convertToSparse(const Builder& b, for (IdxT i = 0; i < b.h_total_nodes; ++i) { const auto& hnode = h_nodes[i].info; sparsetree[i + len] = hnode; + sparsetree[i + len].instance_count = h_nodes[i].count; if (hnode.left_child_id != -1) sparsetree[i + len].left_child_id += len; } } diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh index 98d6e5300c..e3f2dc6296 100644 --- a/cpp/src/decisiontree/decisiontree_impl.cuh +++ b/cpp/src/decisiontree/decisiontree_impl.cuh @@ -118,8 +118,11 @@ std::string get_node_json(const std::string &prefix, oss << prefix << "{\"nodeid\": " << idx << ", \"split_feature\": " << node.colid << ", \"split_threshold\": " << to_string_high_precision(node.quesval) - << ", \"gain\": " << to_string_high_precision(node.best_metric_val) - << ", \"yes\": " << node.left_child_id + << ", \"gain\": " << to_string_high_precision(node.best_metric_val); + if (node.instance_count != UINT32_MAX) { + oss << ", \"instance_count\": " << node.instance_count; + } + oss << ", \"yes\": " << node.left_child_id << ", \"no\": " << (node.left_child_id + 1) << ", \"children\": [\n"; // enter the next tree level - left and right branch oss << get_node_json(prefix + " ", sparsetree, node.left_child_id) << ",\n" @@ -128,8 +131,11 @@ std::string get_node_json(const std::string &prefix, << prefix << "]}"; } else { oss << prefix << "{\"nodeid\": " << idx - << ", \"leaf_value\": " << to_string_high_precision(node.prediction) - << "}"; + << ", \"leaf_value\": " << to_string_high_precision(node.prediction); + if (node.instance_count != UINT32_MAX) { + oss << ", \"instance_count\": " << node.instance_count; + } + oss << "}"; } return oss.str(); } diff --git a/python/cuml/test/dask/test_random_forest.py b/python/cuml/test/dask/test_random_forest.py index 927927f127..f3cfe1caa5 100644 --- a/python/cuml/test/dask/test_random_forest.py +++ b/python/cuml/test/dask/test_random_forest.py @@ -453,6 +453,49 @@ def predict_with_json_rf_regressor(rf, x): np.testing.assert_almost_equal(pred, expected_pred, decimal=6) +@pytest.mark.parametrize('max_depth', [1, 2, 3, 5, 10, 15, 20]) +@pytest.mark.parametrize('n_estimators', [5, 10, 20]) +def test_rf_instance_count(client, max_depth, n_estimators): + n_workers = len(client.scheduler_info()['workers']) + if n_estimators < n_workers: + err_msg = "n_estimators cannot be lower than number of dask workers" + pytest.xfail(err_msg) + + X, y = make_classification(n_samples=350, n_features=20, + n_clusters_per_class=1, n_informative=10, + random_state=123, n_classes=2) + X = X.astype(np.float32) + cu_rf_mg = cuRFC_mg(max_features=1.0, max_samples=1.0, + n_bins=16, split_algo=1, split_criterion=0, + min_samples_leaf=2, seed=23707, n_streams=1, + n_estimators=n_estimators, max_leaves=-1, + max_depth=max_depth, use_experimental_backend=True) + y = y.astype(np.int32) + + X_dask, y_dask = _prep_training_data(client, X, y, partitions_per_worker=2) + cu_rf_mg.fit(X_dask, y_dask) + json_out = cu_rf_mg.get_json() + json_obj = json.loads(json_out) + + # The instance count of each node must be equal to the sum of + # the instance counts of its children + def check_instance_count_for_non_leaf(tree): + assert 'instance_count' in tree + if 'children' not in tree: + return + assert 'instance_count' in tree['children'][0] + assert 'instance_count' in tree['children'][1] + assert (tree['instance_count'] == tree['children'][0]['instance_count'] + + tree['children'][1]['instance_count']) + check_instance_count_for_non_leaf(tree['children'][0]) + check_instance_count_for_non_leaf(tree['children'][1]) + + for tree in json_obj: + check_instance_count_for_non_leaf(tree) + # The root's count should be equal to the number of rows in the data + assert tree['instance_count'] == X.shape[0] + + @pytest.mark.parametrize('estimator_type', ['regression', 'classification']) def test_rf_get_combined_model_right_aftter_fit(client, estimator_type): max_depth = 3 diff --git a/python/cuml/test/test_random_forest.py b/python/cuml/test/test_random_forest.py index 0403c20ba5..8a86b4b730 100644 --- a/python/cuml/test/test_random_forest.py +++ b/python/cuml/test/test_random_forest.py @@ -852,6 +852,58 @@ def predict_with_json_rf_regressor(rf, x): np.testing.assert_almost_equal(pred, expected_pred, decimal=6) +@pytest.mark.parametrize('max_depth', [1, 2, 3, 5, 10, 15, 20]) +@pytest.mark.parametrize('n_estimators', [5, 10, 20]) +@pytest.mark.parametrize('use_experimental_backend', [True, False]) +def test_rf_instance_count(max_depth, n_estimators, use_experimental_backend): + X, y = make_classification(n_samples=350, n_features=20, + n_clusters_per_class=1, n_informative=10, + random_state=123, n_classes=2) + X = X.astype(np.float32) + cuml_model = curfc(max_features=1.0, max_samples=1.0, + n_bins=16, split_algo=1, split_criterion=0, + min_samples_leaf=2, seed=23707, n_streams=1, + n_estimators=n_estimators, max_leaves=-1, + max_depth=max_depth, + use_experimental_backend=use_experimental_backend) + y = y.astype(np.int32) + + # Train model on the data + cuml_model.fit(X, y) + + json_out = cuml_model.get_json() + json_obj = json.loads(json_out) + + # The instance count of each node must be equal to the sum of + # the instance counts of its children. Note that the instance count + # is only available with the new backend. + if use_experimental_backend: + def check_instance_count_for_non_leaf(tree): + assert 'instance_count' in tree + if 'children' not in tree: + return + assert 'instance_count' in tree['children'][0] + assert 'instance_count' in tree['children'][1] + assert (tree['instance_count'] + == tree['children'][0]['instance_count'] + + tree['children'][1]['instance_count']) + check_instance_count_for_non_leaf(tree['children'][0]) + check_instance_count_for_non_leaf(tree['children'][1]) + for tree in json_obj: + check_instance_count_for_non_leaf(tree) + # The root's count must be equal to the number of rows in the data + assert tree['instance_count'] == X.shape[0] + else: + def assert_instance_count_absent(tree): + assert 'instance_count' not in tree + if 'children' not in tree: + return + assert_instance_count_absent(tree['children'][0]) + assert_instance_count_absent(tree['children'][1]) + for tree in json_obj: + assert_instance_count_absent(tree) + + @pytest.mark.memleak @pytest.mark.parametrize('estimator_type', ['classification']) def test_rf_host_memory_leak(large_clf, estimator_type): @@ -987,4 +1039,7 @@ def test_rf_regression_with_identical_labels(split_criterion, clf.fit(X, y) model_dump = json.loads(clf.get_json()) assert len(model_dump) == 1 - assert model_dump[0] == {'nodeid': 0, 'leaf_value': 1.0} + expected_dump = {'nodeid': 0, 'leaf_value': 1.0} + if use_experimental_backend: + expected_dump['instance_count'] = 5 + assert model_dump[0] == expected_dump