From 8fa2b9067e67af4d0663da9880683f3ec24a3d04 Mon Sep 17 00:00:00 2001 From: Venkat Date: Thu, 25 Feb 2021 23:08:57 +0530 Subject: [PATCH] NVTX Markers for RF and RF-backend (#3014) * This PR adds NVTX Markers to major time-consuming function calls of the regressors and classifiers of RF and DecisionTrees. * They span both RandomForest and DecisionTree code-bases Authors: - Venkat (@venkywonka) - John Zedlewski (@JohnZed) Approvers: - Thejaswi. N. S (@teju85) - AJ Schmidt (@ajschmidt8) - John Zedlewski (@JohnZed) URL: https://github.com/rapidsai/cuml/pull/3014 --- .../batched-levelalgo/builder.cuh | 4 +++ .../batched-levelalgo/builder_base.cuh | 28 ++++++++++++++++++- cpp/src/decisiontree/decisiontree_impl.cuh | 13 ++++++++- .../decisiontree/levelalgo/common_helper.cuh | 9 +++++- .../levelalgo/levelfunc_classifier.cuh | 19 +++++++++++-- .../levelalgo/levelfunc_regressor.cuh | 18 ++++++++++-- .../levelalgo/levelhelper_classifier.cuh | 13 ++++++++- .../levelalgo/levelhelper_regressor.cuh | 16 ++++++++++- cpp/src/decisiontree/quantile/quantile.cuh | 24 ++++++++++++---- cpp/src/randomforest/randomforest.cu | 8 ++++++ cpp/src/randomforest/randomforest_impl.cuh | 10 +++++++ .../cuml/ensemble/randomforestclassifier.pyx | 9 ++++++ .../cuml/ensemble/randomforestregressor.pyx | 8 ++++++ 13 files changed, 164 insertions(+), 15 deletions(-) diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh index 5a7aa9ab35..6a42e17115 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh @@ -21,6 +21,8 @@ #include "builder_base.cuh" +#include + namespace ML { namespace DecisionTree { @@ -51,6 +53,7 @@ void grow_tree(std::shared_ptr d_allocator, const DecisionTreeParams& params, cudaStream_t stream, std::vector>& sparsetree, IdxT& num_leaves, IdxT& depth) { + ML::PUSH_RANGE("DecisionTree::grow_tree in batched-levelalgo @builder.cuh"); Builder builder; size_t d_wsize, h_wsize; builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels, @@ -68,6 +71,7 @@ void grow_tree(std::shared_ptr d_allocator, d_buff.release(stream); h_buff.release(stream); convertToSparse(builder, h_nodes.data(), sparsetree); + ML::POP_RANGE(); } /** diff --git a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh index 91036b2d47..9f537d5a63 100644 --- a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh +++ b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh @@ -25,6 +25,8 @@ #include "node.cuh" #include "split.cuh" +#include + namespace ML { namespace DecisionTree { @@ -139,6 +141,8 @@ struct Builder { const DataT* data, const LabelT* labels, IdxT totalRows, IdxT totalCols, IdxT sampledRows, IdxT sampledCols, IdxT* rowids, IdxT nclasses, const DataT* quantiles) { + ML::PUSH_RANGE( + "Builder::workspaceSize @builder_base.cuh [batched-levelalgo]"); ASSERT(quantiles != nullptr, "Currently quantiles need to be computed before this call!"); params = p; @@ -200,6 +204,7 @@ struct Builder { calculateAlignedBytes(sizeof(NodeT) * 2 * max_batch); // next_nodes // all nodes in the tree h_wsize = calculateAlignedBytes(sizeof(IdxT)); // h_n_nodes + ML::POP_RANGE(); } /** @@ -210,6 +215,8 @@ struct Builder { * @param[in] h_wspace pinned host buffer needed to store the learned nodes */ void assignWorkspace(char* d_wspace, char* h_wspace) { + ML::PUSH_RANGE( + "Builder::assignWorkspace @builder_base.cuh [batched-levelalgo]"); auto max_batch = params.max_batch_size; auto n_col_blks = n_blks_for_cols; // device @@ -245,6 +252,7 @@ struct Builder { next_nodes = reinterpret_cast(d_wspace); // host h_n_nodes = reinterpret_cast(h_wspace); + ML::POP_RANGE(); } /** @@ -258,6 +266,7 @@ struct Builder { */ void train(std::vector>& h_nodes, IdxT& num_leaves, IdxT& depth, cudaStream_t s) { + ML::PUSH_RANGE("Builder::train @builder_base.cuh [batched-levelalgo]"); init(h_nodes, s); while (true) { IdxT new_nodes = doSplit(h_nodes, s); @@ -267,6 +276,7 @@ struct Builder { } raft::update_host(&num_leaves, n_leaves, 1, s); raft::update_host(&depth, n_depth, 1, s); + ML::POP_RANGE(); } private: @@ -322,6 +332,7 @@ struct Builder { */ IdxT doSplit(std::vector>& h_nodes, cudaStream_t s) { + ML::PUSH_RANGE("Builder::doSplit @bulder_base.cuh [batched-levelalgo]"); auto batchSize = node_end - node_start; // start fresh on the number of *new* nodes created in this batch CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), s)); @@ -338,6 +349,7 @@ struct Builder { } // create child nodes (or make the current ones leaf) auto smemSize = Traits::nodeSplitSmemSize(*this); + ML::PUSH_RANGE("nodeSplitKernel @builder_base.cuh [batched-levelalgo]"); nodeSplitKernel <<>>( @@ -345,6 +357,7 @@ struct Builder { params.max_leaves, params.min_impurity_decrease, input, curr_nodes, next_nodes, n_nodes, splits, n_leaves, h_total_nodes, n_depth); CUDA_CHECK(cudaGetLastError()); + ML::POP_RANGE(); // copy the updated (due to leaf creation) and newly created child nodes raft::update_host(h_n_nodes, n_nodes, 1, s); CUDA_CHECK(cudaStreamSynchronize(s)); @@ -352,6 +365,7 @@ struct Builder { raft::update_host(h_nodes.data() + node_start, curr_nodes, batchSize, s); raft::update_host(h_nodes.data() + h_total_nodes, next_nodes, *h_n_nodes, s); + ML::POP_RANGE(); return *h_n_nodes; } }; // end Builder @@ -391,6 +405,8 @@ struct ClsTraits { static void computeSplit(Builder>& b, IdxT col, IdxT batchSize, CRITERION splitType, cudaStream_t s) { + ML::PUSH_RANGE( + "Builder::computeSplit @builder_base.cuh [batched-levelalgo]"); auto nbins = b.params.n_bins; auto nclasses = b.input.nclasses; auto binSize = nbins * 2 * nclasses; @@ -403,12 +419,16 @@ struct ClsTraits { smemSize += 2 * sizeof(DataT) + 1 * sizeof(int); CUDA_CHECK(cudaMemsetAsync(b.hist, 0, sizeof(int) * b.nHistBins, s)); + ML::PUSH_RANGE( + "computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]"); computeSplitClassificationKernel <<>>( b.hist, b.params.n_bins, b.params.max_depth, b.params.min_samples_split, b.params.min_samples_leaf, b.params.min_impurity_decrease, b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex, b.n_leaves, b.splits, splitType, b.treeid, b.seed); + ML::POP_RANGE(); //computeSplitClassificationKernel + ML::POP_RANGE(); //Builder::computeSplit } /** @@ -460,8 +480,9 @@ struct RegTraits { static void computeSplit(Builder>& b, IdxT col, IdxT batchSize, CRITERION splitType, cudaStream_t s) { + ML::PUSH_RANGE( + "Builder::computeSplit @builder_base.cuh [batched-levelalgo]"); auto n_col_blks = std::min(b.n_blks_for_cols, b.input.nSampledCols - col); - dim3 grid(b.n_blks_for_rows, n_col_blks, batchSize); auto nbins = b.params.n_bins; size_t smemSize = 7 * nbins * sizeof(DataT) + nbins * sizeof(int); @@ -478,6 +499,9 @@ struct RegTraits { CUDA_CHECK(cudaMemsetAsync(b.pred2P, 0, sizeof(DataT) * b.nPredCounts, s)); CUDA_CHECK( cudaMemsetAsync(b.pred_count, 0, sizeof(IdxT) * b.nPredCounts, s)); + + ML::PUSH_RANGE( + "computeSplitRegressionKernel @builder_base.cuh [batched-levelalgo]"); computeSplitRegressionKernel <<>>( b.pred, b.pred2, b.pred2P, b.pred_count, b.params.n_bins, @@ -485,6 +509,8 @@ struct RegTraits { b.params.min_samples_leaf, b.params.min_impurity_decrease, b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex, b.n_leaves, b.splits, b.block_sync, splitType, b.treeid, b.seed); + ML::POP_RANGE(); //computeSplitRegressionKernel + ML::POP_RANGE(); //Builder::computeSplit } /** diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh index fd9a717569..ce0b23fab4 100644 --- a/cpp/src/decisiontree/decisiontree_impl.cuh +++ b/cpp/src/decisiontree/decisiontree_impl.cuh @@ -33,6 +33,8 @@ #include "quantile/quantile.h" #include "treelite_util.h" +#include + namespace ML { bool is_dev_ptr(const void *p) { @@ -262,6 +264,7 @@ void DecisionTreeBase::plant( const int nrows, const L *labels, unsigned int *rowids, const int n_sampled_rows, int unique_labels, const int treeid, uint64_t seed) { + ML::PUSH_RANGE("DecisionTreeBase::plant @decisiontree_impl.cuh"); dinfo.NLocalrows = nrows; dinfo.NGlobalrows = nrows; dinfo.Ncols = ncols; @@ -274,7 +277,7 @@ void DecisionTreeBase::plant( } CUDA_CHECK(cudaStreamSynchronize( tempmem->stream)); // added to ensure accurate measurement - + ML::PUSH_RANGE("DecisionTreeBase::plant::bootstrapping features"); //Bootstrap features unsigned int *h_colids = tempmem->h_colids->data(); if (tree_params.bootstrap_features) { @@ -285,6 +288,7 @@ void DecisionTreeBase::plant( } else { std::iota(h_colids, h_colids + dinfo.Ncols, 0); } + ML::POP_RANGE(); prepare_time = prepare_fit_timer.getElapsedSeconds(); total_temp_mem = tempmem->totalmem; @@ -304,6 +308,7 @@ void DecisionTreeBase::plant( treeid, tempmem); } train_time = timer.getElapsedSeconds(); + ML::POP_RANGE(); } template @@ -484,6 +489,8 @@ void DecisionTreeClassifier::grow_deep_tree( const int n_sampled_rows, const int ncols, const float colper, const int nrows, std::vector> &sparsetree, const int treeid, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTreeClassifier::grow_deep_tree @decisiontree_impl.cuh"); int leaf_cnt = 0; int depth_cnt = 0; grow_deep_tree_classification(data, labels, rowids, ncols, colper, @@ -492,6 +499,7 @@ void DecisionTreeClassifier::grow_deep_tree( sparsetree, treeid, tempmem); this->depth_counter = depth_cnt; this->leaf_counter = leaf_cnt; + ML::POP_RANGE(); } template @@ -500,6 +508,8 @@ void DecisionTreeRegressor::grow_deep_tree( const int n_sampled_rows, const int ncols, const float colper, const int nrows, std::vector> &sparsetree, const int treeid, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTreeRegressor::grow_deep_tree @decisiontree_impl.cuh"); int leaf_cnt = 0; int depth_cnt = 0; grow_deep_tree_regression(data, labels, rowids, ncols, colper, n_sampled_rows, @@ -507,6 +517,7 @@ void DecisionTreeRegressor::grow_deep_tree( sparsetree, treeid, tempmem); this->depth_counter = depth_cnt; this->leaf_counter = leaf_cnt; + ML::POP_RANGE(); } //Class specializations diff --git a/cpp/src/decisiontree/levelalgo/common_helper.cuh b/cpp/src/decisiontree/levelalgo/common_helper.cuh index 5a028817eb..6795e00980 100644 --- a/cpp/src/decisiontree/levelalgo/common_helper.cuh +++ b/cpp/src/decisiontree/levelalgo/common_helper.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ #include #include "common_kernel.cuh" +#include + namespace ML { namespace DecisionTree { @@ -38,6 +40,8 @@ void update_feature_sampling(unsigned int *h_colids, unsigned int *d_colids, std::vector &feature_selector, std::shared_ptr> tempmem, raft::random::Rng &d_rng) { + ML::PUSH_RANGE( + "update_feature_sampling @common_helper.cuh (does feature subsampling)"); if (h_colstart != nullptr) { if (Ncols != ncols_sampled) { std::shuffle(h_colids, h_colids + Ncols, rng); @@ -63,6 +67,7 @@ void update_feature_sampling(unsigned int *h_colids, unsigned int *d_colids, raft::update_device(d_colids, h_colids, ncols_sampled * n_nodes, tempmem->stream); } + ML::POP_RANGE(); } //This function calcualtes min/max from the samples that belong in a given node. This is done for all the nodes at a given level @@ -104,6 +109,7 @@ void get_minmax(const T *data, const unsigned int *flags, void setup_sampling(unsigned int *flagsptr, unsigned int *sample_cnt, const unsigned int *rowids, const int nrows, const int n_sampled_rows, cudaStream_t &stream) { + ML::PUSH_RANGE("DecisionTree::setup_sampling @common_helper.cuh"); CUDA_CHECK(cudaMemsetAsync(sample_cnt, 0, nrows * sizeof(int), stream)); int threads = 256; int blocks = raft::ceildiv(n_sampled_rows, threads); @@ -114,6 +120,7 @@ void setup_sampling(unsigned int *flagsptr, unsigned int *sample_cnt, setup_flags_kernel<<>>(sample_cnt, flagsptr, nrows); CUDA_CHECK(cudaGetLastError()); + ML::POP_RANGE(); //setup_sampling @common_helper.cuh } //This function call the split kernel diff --git a/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh b/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh index e41d495224..d33c82184e 100644 --- a/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh +++ b/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ #include "levelhelper_classifier.cuh" #include "metric.cuh" +#include + namespace ML { namespace DecisionTree { @@ -45,6 +47,8 @@ void grow_deep_tree_classification( const ML::DecisionTree::DecisionTreeParams& tree_params, int& depth_cnt, int& leaf_cnt, std::vector>& sparsetree, const int treeid, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTree::grow_deep_tree_classification @levelfunc_classifier.cuh"); const int ncols_sampled = (int)(colper * Ncols); unsigned int* flagsptr = tempmem->d_flags->data(); unsigned int* sample_cnt = tempmem->d_sample_cnt->data(); @@ -111,6 +115,7 @@ void grow_deep_tree_classification( int scatter_algo_depth = std::min(tempmem->swap_depth, tree_params.max_depth + 1); + ML::PUSH_RANGE("scatter phase @levelfunc_classifier"); for (int depth = 0; (depth < scatter_algo_depth) && (n_nodes_nextitr != 0); depth++) { depth_cnt = depth; @@ -166,12 +171,19 @@ void grow_deep_tree_classification( 2 * n_nodes * n_unique_labels * sizeof(unsigned int)); } } + ML::POP_RANGE(); //scatter phase @levelfunc_classifier.cuh + + ML::PUSH_RANGE("gather phase @levelfunc_classifier.cuh"); // Start of gather algorithm //Convertor CUML_LOG_DEBUG("begin gather "); int lastsize = sparsetree.size() - sparsesize_nextitr; n_nodes = n_nodes_nextitr; - if (n_nodes == 0) return; + if (n_nodes == 0) { + ML::POP_RANGE(); //gather phase ended + ML::POP_RANGE(); //grow_deep_tree_classification end + return; + } unsigned int *d_nodecount, *d_samplelist, *d_nodestart; SparseTreeNode* d_sparsenodes; SparseTreeNode* h_sparsenodes; @@ -250,6 +262,9 @@ void grow_deep_tree_classification( sparsetree.insert(sparsetree.end(), h_sparsenodes, h_sparsenodes + lastsize); } + + ML::POP_RANGE(); //gather phase @levelfunc_classifier.cuh + ML::POP_RANGE(); //grow_deep_tree_classification @levelfunc_classifier.cuh } } // namespace DecisionTree diff --git a/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh b/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh index fd1e2fdd44..6a91229f34 100644 --- a/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh +++ b/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -43,6 +43,8 @@ void grow_deep_tree_regression( const ML::DecisionTree::DecisionTreeParams& tree_params, int& depth_cnt, int& leaf_cnt, std::vector>& sparsetree, const int treeid, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTree::grow_deep_tree_classification @levelfunc_regressor.cuh"); const int ncols_sampled = (int)(colper * Ncols); unsigned int* flagsptr = tempmem->d_flags->data(); unsigned int* sample_cnt = tempmem->d_sample_cnt->data(); @@ -117,6 +119,7 @@ void grow_deep_tree_regression( int scatter_algo_depth = std::min(tempmem->swap_depth, tree_params.max_depth + 1); + ML::PUSH_RANGE("scatter phase @levelfunc_regressor"); for (int depth = 0; (depth < scatter_algo_depth) && (n_nodes_nextitr != 0); depth++) { depth_cnt = depth; @@ -173,13 +176,18 @@ void grow_deep_tree_regression( n_nodes, tree_params.split_algo, d_split_colidx, d_split_binidx, d_new_node_flags, flagsptr, tempmem); } - + ML::POP_RANGE(); + ML::PUSH_RANGE("gather phase @levelfunc_regressor.cuh"); // Start of gather algorithm //Convertor int lastsize = sparsetree.size() - sparsesize_nextitr; n_nodes = n_nodes_nextitr; - if (n_nodes == 0) return; + if (n_nodes == 0) { + ML::POP_RANGE(); // gather pahse ended + ML::POP_RANGE(); // grow_deep_tree_classification end + return; + } unsigned int *d_nodecount, *d_samplelist, *d_nodestart; SparseTreeNode* d_sparsenodes; SparseTreeNode* h_sparsenodes; @@ -244,6 +252,10 @@ void grow_deep_tree_regression( sparsetree.insert(sparsetree.end(), h_sparsenodes, h_sparsenodes + lastsize); } + + ML::POP_RANGE(); // gather phase @levelfunc_regressor.cuh + + ML::POP_RANGE(); // grow_deep_tree_classification } } // namespace DecisionTree diff --git a/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh b/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh index 51cf5b7d10..3cd5b762d5 100644 --- a/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh +++ b/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -17,6 +17,8 @@ #include #include "levelkernel_classifier.cuh" +#include + namespace ML { namespace DecisionTree { @@ -25,6 +27,8 @@ void initial_metric_classification( const int *labels, unsigned int *sample_cnt, const int nrows, const int n_unique_labels, std::vector &histvec, T &initial_metric, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTree::initial_metric_classification @levelhelper_classifier.cuh"); CUDA_CHECK(cudaMemsetAsync(tempmem->d_parent_hist->data(), 0, n_unique_labels * sizeof(unsigned int), tempmem->stream)); @@ -41,6 +45,7 @@ void initial_metric_classification( histvec.assign(tempmem->h_parent_hist->data(), tempmem->h_parent_hist->data() + n_unique_labels); initial_metric = F::exec(histvec, nrows); + ML::POP_RANGE(); } template @@ -50,6 +55,9 @@ void get_histogram_classification( const int ncols_sampled, const int n_unique_labels, const int nbins, const int n_nodes, const int split_algo, std::shared_ptr> tempmem, unsigned int *histout) { + ML::PUSH_RANGE( + "DecisionTree::get_histogram_classification @levelhelper_classifier.cuh"); + size_t histcount = ncols_sampled * nbins * n_unique_labels * n_nodes; CUDA_CHECK(cudaMemsetAsync(histout, 0, histcount * sizeof(unsigned int), tempmem->stream)); @@ -94,6 +102,7 @@ void get_histogram_classification( } } CUDA_CHECK(cudaGetLastError()); + ML::POP_RANGE(); } template void get_best_split_classification( @@ -107,6 +116,7 @@ void get_best_split_classification( std::vector &sparse_nodelist, int *split_colidx, int *split_binidx, int *d_split_colidx, int *d_split_binidx, std::shared_ptr> tempmem) { + ML::PUSH_RANGE("get_best_split_classification @levelhelper_classifier.cuh"); T *quantile = nullptr; T *minmax = nullptr; if (tempmem->h_quantile != nullptr) quantile = tempmem->h_quantile->data(); @@ -284,6 +294,7 @@ void get_best_split_classification( raft::update_device(d_split_binidx, split_binidx, n_nodes, tempmem->stream); raft::update_device(d_split_colidx, split_colidx, n_nodes, tempmem->stream); } + ML::POP_RANGE(); } template diff --git a/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh b/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh index 48c424ba6a..b0ee921bb4 100644 --- a/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh +++ b/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -25,6 +25,8 @@ void initial_metric_regression(const T *labels, unsigned int *sample_cnt, const int nrows, T &mean, unsigned int &count, T &initial_metric, std::shared_ptr> tempmem) { + ML::PUSH_RANGE( + "DecisionTree::initial_metric_classification @levelhelper_regressor.cuh"); CUDA_CHECK( cudaMemsetAsync(tempmem->d_mseout->data(), 0, sizeof(T), tempmem->stream)); CUDA_CHECK( @@ -52,6 +54,7 @@ void initial_metric_regression(const T *labels, unsigned int *sample_cnt, count = tempmem->h_count->data()[0]; mean = tempmem->h_predout->data()[0] / count; initial_metric = tempmem->h_mseout->data()[0] / count; + ML::POP_RANGE(); } template @@ -63,6 +66,8 @@ void get_mse_regression_fused(const T *data, const T *labels, std::shared_ptr> tempmem, T *d_mseout, T *d_predout, unsigned int *d_count) { + ML::PUSH_RANGE( + "DecisionTree::get_mse_regression_fused @levelhelper_regressor.cuh"); size_t predcount = ncols_sampled * nbins * n_nodes; CUDA_CHECK( cudaMemsetAsync(d_mseout, 0, 2 * predcount * sizeof(T), tempmem->stream)); @@ -114,6 +119,7 @@ void get_mse_regression_fused(const T *data, const T *labels, } CUDA_CHECK(cudaGetLastError()); } + ML::POP_RANGE(); } template void get_mse_regression(const T *data, const T *labels, unsigned int *flags, @@ -123,6 +129,7 @@ void get_mse_regression(const T *data, const T *labels, unsigned int *flags, const int split_algo, std::shared_ptr> tempmem, T *d_mseout, T *d_predout, unsigned int *d_count) { + ML::PUSH_RANGE("DecisionTree::get_mse_regression @levelhelper_regressor.cuh"); size_t predcount = ncols_sampled * nbins * n_nodes; CUDA_CHECK( cudaMemsetAsync(d_mseout, 0, 2 * predcount * sizeof(T), tempmem->stream)); @@ -209,6 +216,7 @@ void get_mse_regression(const T *data, const T *labels, unsigned int *flags, } CUDA_CHECK(cudaGetLastError()); } + ML::POP_RANGE(); } template void get_best_split_regression( @@ -223,6 +231,7 @@ void get_best_split_regression( std::vector &sparse_nodelist, int *split_colidx, int *split_binidx, int *d_split_colidx, int *d_split_binidx, std::shared_ptr> tempmem) { + ML::PUSH_RANGE("get_best_split_regression @levelhelper_regressor.cuh"); T *quantile = nullptr; T *minmax = nullptr; if (tempmem->h_quantile != nullptr) quantile = tempmem->h_quantile->data(); @@ -395,6 +404,7 @@ void get_best_split_regression( raft::update_device(d_split_binidx, split_binidx, n_nodes, tempmem->stream); raft::update_device(d_split_colidx, split_colidx, n_nodes, tempmem->stream); } + ML::POP_RANGE(); } template @@ -406,6 +416,7 @@ void leaf_eval_regression(float *gain, int curr_depth, const int sparsesize, std::vector &sparse_mean, int &n_nodes_next, std::vector &sparse_nodelist, int &tree_leaf_cnt) { + ML::PUSH_RANGE("leaf_eval_regression @levelhelper_regressor.cuh"); std::vector tmp_sparse_nodelist(sparse_nodelist); sparse_nodelist.clear(); @@ -434,6 +445,7 @@ void leaf_eval_regression(float *gain, int curr_depth, int nleafed = tmp_sparse_nodelist.size() - non_leaf_counter; tree_leaf_cnt += nleafed; n_nodes_next = 2 * non_leaf_counter; + ML::POP_RANGE(); } template @@ -466,6 +478,7 @@ void best_split_gather_regression( const float min_impurity_split, std::shared_ptr> tempmem, SparseTreeNode *d_sparsenodes, int *d_nodelist) { + ML::PUSH_RANGE("get_best_split_gather_regression @levelhelper_regressor.cuh"); const int TPB = TemporaryMemory::gather_threads; if (split_cr == ML::CRITERION::MSE) { if (split_algo == ML::SPLIT_ALGO::HIST) { @@ -508,6 +521,7 @@ void best_split_gather_regression( } CUDA_CHECK(cudaGetLastError()); } + ML::POP_RANGE(); } template void make_leaf_gather_regression( diff --git a/cpp/src/decisiontree/quantile/quantile.cuh b/cpp/src/decisiontree/quantile/quantile.cuh index cf3a61d2ec..ae651074b6 100644 --- a/cpp/src/decisiontree/quantile/quantile.cuh +++ b/cpp/src/decisiontree/quantile/quantile.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,8 @@ #include #include "quantile.h" +#include + namespace ML { namespace DecisionTree { @@ -83,7 +85,9 @@ void preprocess_quantile(const T *data, const unsigned int *rowids, int max_ncols = free_mem / (2 * n_sampled_rows * sizeof(T)); int batch_cols = (max_ncols > ncols) ? ncols : max_ncols; ASSERT(max_ncols != 0, "Cannot preprocess quantiles due to insufficient device memory."); - */ + */ + + ML::PUSH_RANGE("preprocessing quantile @quantile.cuh"); int batch_cols = 1; // Processing one column at a time, for now, until an appropriate getMemInfo function is provided for the deviceAllocator interface. @@ -109,8 +113,10 @@ void preprocess_quantile(const T *data, const unsigned int *rowids, tempmem->stream, batch_cols + 1); blocks = raft::ceildiv(batch_cols + 1, threads); + ML::PUSH_RANGE("set_sorting_offset kernel @quantile.cuh"); set_sorting_offset<<stream>>>( n_sampled_rows, batch_cols, d_offsets->data()); + ML::POP_RANGE(); CUDA_CHECK(cudaGetLastError()); // Determine temporary device storage requirements @@ -126,15 +132,18 @@ void preprocess_quantile(const T *data, const unsigned int *rowids, d_keys_out = new MLCommon::device_buffer(tempmem->device_allocator, tempmem->stream, batch_items); - + ML::PUSH_RANGE( + "DecisionTree::cub::DeviceRadixSort::SortKeys over batch_items " + "@quantile.cuh"); CUDA_CHECK(cub::DeviceRadixSort::SortKeys( d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out->data(), batch_items, 0, 8 * sizeof(T), tempmem->stream)); - + ML::POP_RANGE(); // Allocate temporary storage d_temp_storage = new MLCommon::device_buffer( tempmem->device_allocator, tempmem->stream, temp_storage_bytes); + ML::PUSH_RANGE("iterative quantile computation for each batch"); // Compute quantiles for cur_batch_cols columns per loop iteration. for (int batch = 0; batch < batch_cnt; batch++) { int cur_batch_cols = (batch == batch_cnt - 1) @@ -143,20 +152,24 @@ void preprocess_quantile(const T *data, const unsigned int *rowids, int batch_offset = batch * n_sampled_rows * batch_cols; int quantile_offset = batch * nbins * batch_cols; - + ML::PUSH_RANGE("DeviceRadixSort::SortKeys"); CUDA_CHECK(cub::DeviceRadixSort::SortKeys( (void *)d_temp_storage->data(), temp_storage_bytes, &d_keys_in[batch_offset], d_keys_out->data(), n_sampled_rows, 0, 8 * sizeof(T), tempmem->stream)); + ML::POP_RANGE(); blocks = raft::ceildiv(cur_batch_cols * nbins, threads); + ML::PUSH_RANGE("get_all_quantiles kernel @quantile.cuh"); get_all_quantiles<<stream>>>( d_keys_out->data(), &tempmem->d_quantile->data()[quantile_offset], n_sampled_rows, cur_batch_cols, nbins); + ML::POP_RANGE(); CUDA_CHECK(cudaGetLastError()); CUDA_CHECK(cudaStreamSynchronize(tempmem->stream)); } + ML::POP_RANGE(); raft::update_host(tempmem->h_quantile->data(), tempmem->d_quantile->data(), nbins * ncols, tempmem->stream); d_keys_out->release(tempmem->stream); @@ -165,6 +178,7 @@ void preprocess_quantile(const T *data, const unsigned int *rowids, delete d_keys_out; delete d_offsets; delete d_temp_storage; + ML::POP_RANGE(); return; } diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu index 4ec5bfb2a2..632e9a485d 100644 --- a/cpp/src/randomforest/randomforest.cu +++ b/cpp/src/randomforest/randomforest.cu @@ -524,6 +524,7 @@ ModelHandle concatenate_trees(std::vector treelite_handles) { void fit(const raft::handle_t& user_handle, RandomForestClassifierF*& forest, float* input, int n_rows, int n_cols, int* labels, int n_unique_labels, RF_params rf_params, int verbosity) { + ML::PUSH_RANGE("RF::fit @randomforest.cu"); ML::Logger::get().setLevel(verbosity); ASSERT(!forest->trees, "Cannot fit an existing forest."); forest->trees = @@ -534,11 +535,13 @@ void fit(const raft::handle_t& user_handle, RandomForestClassifierF*& forest, std::make_shared>(rf_params); rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest); + ML::POP_RANGE(); } void fit(const raft::handle_t& user_handle, RandomForestClassifierD*& forest, double* input, int n_rows, int n_cols, int* labels, int n_unique_labels, RF_params rf_params, int verbosity) { + ML::PUSH_RANGE("RF::fit @randomforest.cu"); ML::Logger::get().setLevel(verbosity); ASSERT(!forest->trees, "Cannot fit an existing forest."); forest->trees = @@ -549,6 +552,7 @@ void fit(const raft::handle_t& user_handle, RandomForestClassifierD*& forest, std::make_shared>(rf_params); rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest); + ML::POP_RANGE(); } /** @} */ @@ -693,6 +697,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features, void fit(const raft::handle_t& user_handle, RandomForestRegressorF*& forest, float* input, int n_rows, int n_cols, float* labels, RF_params rf_params, int verbosity) { + ML::PUSH_RANGE("RF::fit @randomforest.cu"); ML::Logger::get().setLevel(verbosity); ASSERT(!forest->trees, "Cannot fit an existing forest."); forest->trees = @@ -702,11 +707,13 @@ void fit(const raft::handle_t& user_handle, RandomForestRegressorF*& forest, std::shared_ptr> rf_regressor = std::make_shared>(rf_params); rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, forest); + ML::POP_RANGE(); } void fit(const raft::handle_t& user_handle, RandomForestRegressorD*& forest, double* input, int n_rows, int n_cols, double* labels, RF_params rf_params, int verbosity) { + ML::PUSH_RANGE("RF::fit @randomforest.cu"); ML::Logger::get().setLevel(verbosity); ASSERT(!forest->trees, "Cannot fit an existing forest."); forest->trees = @@ -716,6 +723,7 @@ void fit(const raft::handle_t& user_handle, RandomForestRegressorD*& forest, std::shared_ptr> rf_regressor = std::make_shared>(rf_params); rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, forest); + ML::POP_RANGE(); } /** @} */ diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh index 04a0634749..091996f2cb 100644 --- a/cpp/src/randomforest/randomforest_impl.cuh +++ b/cpp/src/randomforest/randomforest_impl.cuh @@ -27,6 +27,8 @@ #include #include "randomforest_impl.h" +#include + namespace ML { /** * @brief Construct rf (random forest) object. @@ -68,6 +70,7 @@ void rf::prepare_fit_per_tree( int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows, const int num_sms, const cudaStream_t stream, const std::shared_ptr device_allocator) { + ML::PUSH_RANGE("bootstrapping row IDs @randomforest_impl.cuh"); int rs = tree_id; if (rf_params.seed != 0) rs = rf_params.seed + tree_id; @@ -82,6 +85,7 @@ void rf::prepare_fit_per_tree( thrust::sequence(thrust::cuda::par.on(stream), selected_rows, selected_rows + n_sampled_rows); } + ML::POP_RANGE(); } template @@ -153,6 +157,7 @@ void rfClassifier::fit(const raft::handle_t& user_handle, const T* input, int n_rows, int n_cols, int* labels, int n_unique_labels, RandomForestMetaData*& forest) { + ML::PUSH_RANGE("rfClassifer::fit @randomforest_impl.cuh"); this->error_checking(input, labels, n_rows, n_cols, false); const raft::handle_t& handle = user_handle; @@ -243,6 +248,8 @@ void rfClassifier::fit(const raft::handle_t& user_handle, const T* input, } CUDA_CHECK(cudaStreamSynchronize(user_handle.get_stream())); + + ML::POP_RANGE(); } /** @@ -433,6 +440,7 @@ template void rfRegressor::fit(const raft::handle_t& user_handle, const T* input, int n_rows, int n_cols, T* labels, RandomForestMetaData*& forest) { + ML::PUSH_RANGE("rfRegressor::fit @randomforest_impl.cuh"); this->error_checking(input, labels, n_rows, n_cols, false); const raft::handle_t& handle = user_handle; @@ -519,6 +527,8 @@ void rfRegressor::fit(const raft::handle_t& user_handle, const T* input, } CUDA_CHECK(cudaStreamSynchronize(handle.get_stream())); + + ML::POP_RANGE(); } /** diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx index c21ed72202..39908aa60c 100644 --- a/python/cuml/ensemble/randomforestclassifier.pyx +++ b/python/cuml/ensemble/randomforestclassifier.pyx @@ -47,6 +47,7 @@ from libc.stdlib cimport calloc, malloc, free from numba import cuda +from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop from cuml.raft.common.handle cimport handle_t cimport cuml.common.cuda @@ -439,6 +440,8 @@ class RandomForestClassifier(BaseRandomForestModel, y to be of dtype int32. This will increase memory used for the method. """ + nvtx_range_push("Fit RF-Classifier @randomforestclassifier.pyx") + X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, convert_dtype) cdef uintptr_t X_ptr, y_ptr @@ -512,6 +515,7 @@ class RandomForestClassifier(BaseRandomForestModel, self.handle.sync() del X_m del y_m + nvtx_range_pop() return self @cuml.internals.api_base_return_array(get_output_dtype=True) @@ -626,6 +630,7 @@ class RandomForestClassifier(BaseRandomForestModel, ---------- y : {} """ + nvtx_range_push("predict RF-Classifier @randomforestclassifier.pyx") if num_classes: warnings.warn("num_classes is deprecated and will be removed" " in an upcoming version") @@ -653,6 +658,7 @@ class RandomForestClassifier(BaseRandomForestModel, fil_sparse_format=fil_sparse_format, predict_proba=False) + nvtx_range_pop() return preds def _predict_get_all(self, X, convert_dtype=True) -> CumlArray: @@ -859,6 +865,8 @@ class RandomForestClassifier(BaseRandomForestModel, accuracy : float Accuracy of the model [0.0 - 1.0] """ + + nvtx_range_push("score RF-Classifier @randomforestclassifier.pyx") cdef uintptr_t X_ptr, y_ptr _, n_rows, _, _ = \ input_to_cuml_array(X, check_dtype=self.dtype, @@ -913,6 +921,7 @@ class RandomForestClassifier(BaseRandomForestModel, self.handle.sync() del(y_m) del(preds_m) + nvtx_range_pop() return self.stats['accuracy'] def get_summary_text(self): diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx index 39ab17cde8..e7cadb7eb3 100644 --- a/python/cuml/ensemble/randomforestregressor.pyx +++ b/python/cuml/ensemble/randomforestregressor.pyx @@ -47,6 +47,7 @@ from libc.stdlib cimport calloc, malloc, free from numba import cuda +from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop from cuml.raft.common.handle cimport handle_t cimport cuml.common.cuda @@ -419,6 +420,8 @@ class RandomForestRegressor(BaseRandomForestModel, Perform Random Forest Regression on the input data """ + nvtx_range_push("Fit RF-Regressor @randomforestregressor.pyx") + X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y, convert_dtype) @@ -485,6 +488,7 @@ class RandomForestRegressor(BaseRandomForestModel, self.handle.sync() del X_m del y_m + nvtx_range_pop() return self def _predict_model_on_cpu(self, X, convert_dtype) -> CumlArray: @@ -579,6 +583,7 @@ class RandomForestRegressor(BaseRandomForestModel, y : {} """ + nvtx_range_push("predict RF-Regressor @randomforestregressor.pyx") if predict_model == "CPU": preds = self._predict_model_on_cpu(X, convert_dtype) @@ -597,6 +602,7 @@ class RandomForestRegressor(BaseRandomForestModel, convert_dtype=convert_dtype, fil_sparse_format=fil_sparse_format) + nvtx_range_pop() return preds @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)'), @@ -645,6 +651,7 @@ class RandomForestRegressor(BaseRandomForestModel, median_abs_error : float or mean_abs_error : float """ + nvtx_range_push("score RF-Regressor @randomforestregressor.pyx") from cuml.metrics.regression import r2_score cdef uintptr_t y_ptr @@ -710,6 +717,7 @@ class RandomForestRegressor(BaseRandomForestModel, self.handle.sync() del(y_m) del(preds_m) + nvtx_range_pop() return stats def get_summary_text(self):