From 8fa2b9067e67af4d0663da9880683f3ec24a3d04 Mon Sep 17 00:00:00 2001
From: Venkat <gvenkatarama@nvidia.com>
Date: Thu, 25 Feb 2021 23:08:57 +0530
Subject: [PATCH] NVTX Markers for RF and RF-backend (#3014)

* This PR adds NVTX Markers to major time-consuming function calls of the regressors and classifiers of RF and DecisionTrees.
* They span both RandomForest and DecisionTree code-bases

Authors:
  - Venkat (@venkywonka)
  - John Zedlewski (@JohnZed)

Approvers:
  - Thejaswi. N. S (@teju85)
  - AJ Schmidt (@ajschmidt8)
  - John Zedlewski (@JohnZed)

URL: https://github.com/rapidsai/cuml/pull/3014
---
 .../batched-levelalgo/builder.cuh             |  4 +++
 .../batched-levelalgo/builder_base.cuh        | 28 ++++++++++++++++++-
 cpp/src/decisiontree/decisiontree_impl.cuh    | 13 ++++++++-
 .../decisiontree/levelalgo/common_helper.cuh  |  9 +++++-
 .../levelalgo/levelfunc_classifier.cuh        | 19 +++++++++++--
 .../levelalgo/levelfunc_regressor.cuh         | 18 ++++++++++--
 .../levelalgo/levelhelper_classifier.cuh      | 13 ++++++++-
 .../levelalgo/levelhelper_regressor.cuh       | 16 ++++++++++-
 cpp/src/decisiontree/quantile/quantile.cuh    | 24 ++++++++++++----
 cpp/src/randomforest/randomforest.cu          |  8 ++++++
 cpp/src/randomforest/randomforest_impl.cuh    | 10 +++++++
 .../cuml/ensemble/randomforestclassifier.pyx  |  9 ++++++
 .../cuml/ensemble/randomforestregressor.pyx   |  8 ++++++
 13 files changed, 164 insertions(+), 15 deletions(-)
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 5a7aa9ab35..6a42e17115 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -21,6 +21,8 @@
 
 #include "builder_base.cuh"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -51,6 +53,7 @@ void grow_tree(std::shared_ptr<MLCommon::deviceAllocator> d_allocator,
                const DecisionTreeParams& params, cudaStream_t stream,
                std::vector<SparseTreeNode<DataT, LabelT>>& sparsetree,
                IdxT& num_leaves, IdxT& depth) {
+  ML::PUSH_RANGE("DecisionTree::grow_tree in batched-levelalgo @builder.cuh");
   Builder<Traits> builder;
   size_t d_wsize, h_wsize;
   builder.workspaceSize(d_wsize, h_wsize, treeid, seed, params, data, labels,
@@ -68,6 +71,7 @@ void grow_tree(std::shared_ptr<MLCommon::deviceAllocator> d_allocator,
   d_buff.release(stream);
   h_buff.release(stream);
   convertToSparse<Traits>(builder, h_nodes.data(), sparsetree);
+  ML::POP_RANGE();
 }
 
 /**
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
index 91036b2d47..9f537d5a63 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder_base.cuh
@@ -25,6 +25,8 @@
 #include "node.cuh"
 #include "split.cuh"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -139,6 +141,8 @@ struct Builder {
                      const DataT* data, const LabelT* labels, IdxT totalRows,
                      IdxT totalCols, IdxT sampledRows, IdxT sampledCols,
                      IdxT* rowids, IdxT nclasses, const DataT* quantiles) {
+    ML::PUSH_RANGE(
+      "Builder::workspaceSize @builder_base.cuh [batched-levelalgo]");
     ASSERT(quantiles != nullptr,
            "Currently quantiles need to be computed before this call!");
     params = p;
@@ -200,6 +204,7 @@ struct Builder {
       calculateAlignedBytes(sizeof(NodeT) * 2 * max_batch);  // next_nodes
     // all nodes in the tree
     h_wsize = calculateAlignedBytes(sizeof(IdxT));  // h_n_nodes
+    ML::POP_RANGE();
   }
 
   /**
@@ -210,6 +215,8 @@ struct Builder {
    * @param[in] h_wspace pinned host buffer needed to store the learned nodes
    */
   void assignWorkspace(char* d_wspace, char* h_wspace) {
+    ML::PUSH_RANGE(
+      "Builder::assignWorkspace @builder_base.cuh [batched-levelalgo]");
     auto max_batch = params.max_batch_size;
     auto n_col_blks = n_blks_for_cols;
     // device
@@ -245,6 +252,7 @@ struct Builder {
     next_nodes = reinterpret_cast<NodeT*>(d_wspace);
     // host
     h_n_nodes = reinterpret_cast<IdxT*>(h_wspace);
+    ML::POP_RANGE();
   }
 
   /**
@@ -258,6 +266,7 @@ struct Builder {
    */
   void train(std::vector<Node<DataT, LabelT, IdxT>>& h_nodes, IdxT& num_leaves,
              IdxT& depth, cudaStream_t s) {
+    ML::PUSH_RANGE("Builder::train @builder_base.cuh [batched-levelalgo]");
     init(h_nodes, s);
     while (true) {
       IdxT new_nodes = doSplit(h_nodes, s);
@@ -267,6 +276,7 @@ struct Builder {
     }
     raft::update_host(&num_leaves, n_leaves, 1, s);
     raft::update_host(&depth, n_depth, 1, s);
+    ML::POP_RANGE();
   }
 
  private:
@@ -322,6 +332,7 @@ struct Builder {
    */
   IdxT doSplit(std::vector<Node<DataT, LabelT, IdxT>>& h_nodes,
                cudaStream_t s) {
+    ML::PUSH_RANGE("Builder::doSplit @bulder_base.cuh [batched-levelalgo]");
     auto batchSize = node_end - node_start;
     // start fresh on the number of *new* nodes created in this batch
     CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), s));
@@ -338,6 +349,7 @@ struct Builder {
     }
     // create child nodes (or make the current ones leaf)
     auto smemSize = Traits::nodeSplitSmemSize(*this);
+    ML::PUSH_RANGE("nodeSplitKernel @builder_base.cuh [batched-levelalgo]");
     nodeSplitKernel<DataT, LabelT, IdxT, typename Traits::DevTraits,
                     Traits::TPB_SPLIT>
       <<<batchSize, Traits::TPB_SPLIT, smemSize, s>>>(
@@ -345,6 +357,7 @@ struct Builder {
         params.max_leaves, params.min_impurity_decrease, input, curr_nodes,
         next_nodes, n_nodes, splits, n_leaves, h_total_nodes, n_depth);
     CUDA_CHECK(cudaGetLastError());
+    ML::POP_RANGE();
     // copy the updated (due to leaf creation) and newly created child nodes
     raft::update_host(h_n_nodes, n_nodes, 1, s);
     CUDA_CHECK(cudaStreamSynchronize(s));
@@ -352,6 +365,7 @@ struct Builder {
     raft::update_host(h_nodes.data() + node_start, curr_nodes, batchSize, s);
     raft::update_host(h_nodes.data() + h_total_nodes, next_nodes, *h_n_nodes,
                       s);
+    ML::POP_RANGE();
     return *h_n_nodes;
   }
 };  // end Builder
@@ -391,6 +405,8 @@ struct ClsTraits {
   static void computeSplit(Builder<ClsTraits<DataT, LabelT, IdxT>>& b, IdxT col,
                            IdxT batchSize, CRITERION splitType,
                            cudaStream_t s) {
+    ML::PUSH_RANGE(
+      "Builder::computeSplit @builder_base.cuh [batched-levelalgo]");
     auto nbins = b.params.n_bins;
     auto nclasses = b.input.nclasses;
     auto binSize = nbins * 2 * nclasses;
@@ -403,12 +419,16 @@ struct ClsTraits {
     smemSize += 2 * sizeof(DataT) + 1 * sizeof(int);
 
     CUDA_CHECK(cudaMemsetAsync(b.hist, 0, sizeof(int) * b.nHistBins, s));
+    ML::PUSH_RANGE(
+      "computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]");
     computeSplitClassificationKernel<DataT, LabelT, IdxT, TPB_DEFAULT>
       <<<grid, TPB_DEFAULT, smemSize, s>>>(
         b.hist, b.params.n_bins, b.params.max_depth, b.params.min_samples_split,
         b.params.min_samples_leaf, b.params.min_impurity_decrease,
         b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
         b.n_leaves, b.splits, splitType, b.treeid, b.seed);
+    ML::POP_RANGE();  //computeSplitClassificationKernel
+    ML::POP_RANGE();  //Builder::computeSplit
   }
 
   /**
@@ -460,8 +480,9 @@ struct RegTraits {
   static void computeSplit(Builder<RegTraits<DataT, IdxT>>& b, IdxT col,
                            IdxT batchSize, CRITERION splitType,
                            cudaStream_t s) {
+    ML::PUSH_RANGE(
+      "Builder::computeSplit @builder_base.cuh [batched-levelalgo]");
     auto n_col_blks = std::min(b.n_blks_for_cols, b.input.nSampledCols - col);
-
     dim3 grid(b.n_blks_for_rows, n_col_blks, batchSize);
     auto nbins = b.params.n_bins;
     size_t smemSize = 7 * nbins * sizeof(DataT) + nbins * sizeof(int);
@@ -478,6 +499,9 @@ struct RegTraits {
     CUDA_CHECK(cudaMemsetAsync(b.pred2P, 0, sizeof(DataT) * b.nPredCounts, s));
     CUDA_CHECK(
       cudaMemsetAsync(b.pred_count, 0, sizeof(IdxT) * b.nPredCounts, s));
+
+    ML::PUSH_RANGE(
+      "computeSplitRegressionKernel @builder_base.cuh [batched-levelalgo]");
     computeSplitRegressionKernel<DataT, DataT, IdxT, TPB_DEFAULT>
       <<<grid, TPB_DEFAULT, smemSize, s>>>(
         b.pred, b.pred2, b.pred2P, b.pred_count, b.params.n_bins,
@@ -485,6 +509,8 @@ struct RegTraits {
         b.params.min_samples_leaf, b.params.min_impurity_decrease,
         b.params.max_leaves, b.input, b.curr_nodes, col, b.done_count, b.mutex,
         b.n_leaves, b.splits, b.block_sync, splitType, b.treeid, b.seed);
+    ML::POP_RANGE();  //computeSplitRegressionKernel
+    ML::POP_RANGE();  //Builder::computeSplit
   }
 
   /**
diff --git a/cpp/src/decisiontree/decisiontree_impl.cuh b/cpp/src/decisiontree/decisiontree_impl.cuh
index fd9a717569..ce0b23fab4 100644
--- a/cpp/src/decisiontree/decisiontree_impl.cuh
+++ b/cpp/src/decisiontree/decisiontree_impl.cuh
@@ -33,6 +33,8 @@
 #include "quantile/quantile.h"
 #include "treelite_util.h"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 
 bool is_dev_ptr(const void *p) {
@@ -262,6 +264,7 @@ void DecisionTreeBase<T, L>::plant(
   const int nrows, const L *labels, unsigned int *rowids,
   const int n_sampled_rows, int unique_labels, const int treeid,
   uint64_t seed) {
+  ML::PUSH_RANGE("DecisionTreeBase::plant @decisiontree_impl.cuh");
   dinfo.NLocalrows = nrows;
   dinfo.NGlobalrows = nrows;
   dinfo.Ncols = ncols;
@@ -274,7 +277,7 @@ void DecisionTreeBase<T, L>::plant(
   }
   CUDA_CHECK(cudaStreamSynchronize(
     tempmem->stream));  // added to ensure accurate measurement
-
+  ML::PUSH_RANGE("DecisionTreeBase::plant::bootstrapping features");
   //Bootstrap features
   unsigned int *h_colids = tempmem->h_colids->data();
   if (tree_params.bootstrap_features) {
@@ -285,6 +288,7 @@ void DecisionTreeBase<T, L>::plant(
   } else {
     std::iota(h_colids, h_colids + dinfo.Ncols, 0);
   }
+  ML::POP_RANGE();
   prepare_time = prepare_fit_timer.getElapsedSeconds();
 
   total_temp_mem = tempmem->totalmem;
@@ -304,6 +308,7 @@ void DecisionTreeBase<T, L>::plant(
                    treeid, tempmem);
   }
   train_time = timer.getElapsedSeconds();
+  ML::POP_RANGE();
 }
 
 template <typename T, typename L>
@@ -484,6 +489,8 @@ void DecisionTreeClassifier<T>::grow_deep_tree(
   const int n_sampled_rows, const int ncols, const float colper,
   const int nrows, std::vector<SparseTreeNode<T, int>> &sparsetree,
   const int treeid, std::shared_ptr<TemporaryMemory<T, int>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTreeClassifier::grow_deep_tree @decisiontree_impl.cuh");
   int leaf_cnt = 0;
   int depth_cnt = 0;
   grow_deep_tree_classification(data, labels, rowids, ncols, colper,
@@ -492,6 +499,7 @@ void DecisionTreeClassifier<T>::grow_deep_tree(
                                 sparsetree, treeid, tempmem);
   this->depth_counter = depth_cnt;
   this->leaf_counter = leaf_cnt;
+  ML::POP_RANGE();
 }
 
 template <typename T>
@@ -500,6 +508,8 @@ void DecisionTreeRegressor<T>::grow_deep_tree(
   const int n_sampled_rows, const int ncols, const float colper,
   const int nrows, std::vector<SparseTreeNode<T, T>> &sparsetree,
   const int treeid, std::shared_ptr<TemporaryMemory<T, T>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTreeRegressor::grow_deep_tree @decisiontree_impl.cuh");
   int leaf_cnt = 0;
   int depth_cnt = 0;
   grow_deep_tree_regression(data, labels, rowids, ncols, colper, n_sampled_rows,
@@ -507,6 +517,7 @@ void DecisionTreeRegressor<T>::grow_deep_tree(
                             sparsetree, treeid, tempmem);
   this->depth_counter = depth_cnt;
   this->leaf_counter = leaf_cnt;
+  ML::POP_RANGE();
 }
 
 //Class specializations
diff --git a/cpp/src/decisiontree/levelalgo/common_helper.cuh b/cpp/src/decisiontree/levelalgo/common_helper.cuh
index 5a028817eb..6795e00980 100644
--- a/cpp/src/decisiontree/levelalgo/common_helper.cuh
+++ b/cpp/src/decisiontree/levelalgo/common_helper.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
 #include <stats/minmax.cuh>
 #include "common_kernel.cuh"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -38,6 +40,8 @@ void update_feature_sampling(unsigned int *h_colids, unsigned int *d_colids,
                              std::vector<unsigned int> &feature_selector,
                              std::shared_ptr<TemporaryMemory<T, L>> tempmem,
                              raft::random::Rng &d_rng) {
+  ML::PUSH_RANGE(
+    "update_feature_sampling @common_helper.cuh (does feature subsampling)");
   if (h_colstart != nullptr) {
     if (Ncols != ncols_sampled) {
       std::shuffle(h_colids, h_colids + Ncols, rng);
@@ -63,6 +67,7 @@ void update_feature_sampling(unsigned int *h_colids, unsigned int *d_colids,
     raft::update_device(d_colids, h_colids, ncols_sampled * n_nodes,
                         tempmem->stream);
   }
+  ML::POP_RANGE();
 }
 
 //This function calcualtes min/max from the samples that belong in a given node. This is done for all the nodes at a given level
@@ -104,6 +109,7 @@ void get_minmax(const T *data, const unsigned int *flags,
 void setup_sampling(unsigned int *flagsptr, unsigned int *sample_cnt,
                     const unsigned int *rowids, const int nrows,
                     const int n_sampled_rows, cudaStream_t &stream) {
+  ML::PUSH_RANGE("DecisionTree::setup_sampling @common_helper.cuh");
   CUDA_CHECK(cudaMemsetAsync(sample_cnt, 0, nrows * sizeof(int), stream));
   int threads = 256;
   int blocks = raft::ceildiv(n_sampled_rows, threads);
@@ -114,6 +120,7 @@ void setup_sampling(unsigned int *flagsptr, unsigned int *sample_cnt,
   setup_flags_kernel<<<blocks, threads, 0, stream>>>(sample_cnt, flagsptr,
                                                      nrows);
   CUDA_CHECK(cudaGetLastError());
+  ML::POP_RANGE();  //setup_sampling @common_helper.cuh
 }
 
 //This function call the split kernel
diff --git a/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh b/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh
index e41d495224..d33c82184e 100644
--- a/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh
+++ b/cpp/src/decisiontree/levelalgo/levelfunc_classifier.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 #include "levelhelper_classifier.cuh"
 #include "metric.cuh"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -45,6 +47,8 @@ void grow_deep_tree_classification(
   const ML::DecisionTree::DecisionTreeParams& tree_params, int& depth_cnt,
   int& leaf_cnt, std::vector<SparseTreeNode<T, int>>& sparsetree,
   const int treeid, std::shared_ptr<TemporaryMemory<T, int>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTree::grow_deep_tree_classification @levelfunc_classifier.cuh");
   const int ncols_sampled = (int)(colper * Ncols);
   unsigned int* flagsptr = tempmem->d_flags->data();
   unsigned int* sample_cnt = tempmem->d_sample_cnt->data();
@@ -111,6 +115,7 @@ void grow_deep_tree_classification(
 
   int scatter_algo_depth =
     std::min(tempmem->swap_depth, tree_params.max_depth + 1);
+  ML::PUSH_RANGE("scatter phase @levelfunc_classifier");
   for (int depth = 0; (depth < scatter_algo_depth) && (n_nodes_nextitr != 0);
        depth++) {
     depth_cnt = depth;
@@ -166,12 +171,19 @@ void grow_deep_tree_classification(
              2 * n_nodes * n_unique_labels * sizeof(unsigned int));
     }
   }
+  ML::POP_RANGE();  //scatter phase @levelfunc_classifier.cuh
+
+  ML::PUSH_RANGE("gather phase @levelfunc_classifier.cuh");
   // Start of gather algorithm
   //Convertor
   CUML_LOG_DEBUG("begin gather ");
   int lastsize = sparsetree.size() - sparsesize_nextitr;
   n_nodes = n_nodes_nextitr;
-  if (n_nodes == 0) return;
+  if (n_nodes == 0) {
+    ML::POP_RANGE();  //gather phase ended
+    ML::POP_RANGE();  //grow_deep_tree_classification end
+    return;
+  }
   unsigned int *d_nodecount, *d_samplelist, *d_nodestart;
   SparseTreeNode<T, int>* d_sparsenodes;
   SparseTreeNode<T, int>* h_sparsenodes;
@@ -250,6 +262,9 @@ void grow_deep_tree_classification(
     sparsetree.insert(sparsetree.end(), h_sparsenodes,
                       h_sparsenodes + lastsize);
   }
+
+  ML::POP_RANGE();  //gather phase @levelfunc_classifier.cuh
+  ML::POP_RANGE();  //grow_deep_tree_classification @levelfunc_classifier.cuh
 }
 
 }  // namespace DecisionTree
diff --git a/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh b/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh
index fd1e2fdd44..6a91229f34 100644
--- a/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh
+++ b/cpp/src/decisiontree/levelalgo/levelfunc_regressor.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -43,6 +43,8 @@ void grow_deep_tree_regression(
   const ML::DecisionTree::DecisionTreeParams& tree_params, int& depth_cnt,
   int& leaf_cnt, std::vector<SparseTreeNode<T, T>>& sparsetree,
   const int treeid, std::shared_ptr<TemporaryMemory<T, T>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTree::grow_deep_tree_classification @levelfunc_regressor.cuh");
   const int ncols_sampled = (int)(colper * Ncols);
   unsigned int* flagsptr = tempmem->d_flags->data();
   unsigned int* sample_cnt = tempmem->d_sample_cnt->data();
@@ -117,6 +119,7 @@ void grow_deep_tree_regression(
 
   int scatter_algo_depth =
     std::min(tempmem->swap_depth, tree_params.max_depth + 1);
+  ML::PUSH_RANGE("scatter phase @levelfunc_regressor");
   for (int depth = 0; (depth < scatter_algo_depth) && (n_nodes_nextitr != 0);
        depth++) {
     depth_cnt = depth;
@@ -173,13 +176,18 @@ void grow_deep_tree_regression(
                      n_nodes, tree_params.split_algo, d_split_colidx,
                      d_split_binidx, d_new_node_flags, flagsptr, tempmem);
   }
-
+  ML::POP_RANGE();
+  ML::PUSH_RANGE("gather phase @levelfunc_regressor.cuh");
   // Start of gather algorithm
   //Convertor
 
   int lastsize = sparsetree.size() - sparsesize_nextitr;
   n_nodes = n_nodes_nextitr;
-  if (n_nodes == 0) return;
+  if (n_nodes == 0) {
+    ML::POP_RANGE();  // gather pahse ended
+    ML::POP_RANGE();  // grow_deep_tree_classification end
+    return;
+  }
   unsigned int *d_nodecount, *d_samplelist, *d_nodestart;
   SparseTreeNode<T, T>* d_sparsenodes;
   SparseTreeNode<T, T>* h_sparsenodes;
@@ -244,6 +252,10 @@ void grow_deep_tree_regression(
     sparsetree.insert(sparsetree.end(), h_sparsenodes,
                       h_sparsenodes + lastsize);
   }
+
+  ML::POP_RANGE();  // gather phase @levelfunc_regressor.cuh
+
+  ML::POP_RANGE();  // grow_deep_tree_classification
 }
 
 }  // namespace DecisionTree
diff --git a/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh b/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh
index 51cf5b7d10..3cd5b762d5 100644
--- a/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh
+++ b/cpp/src/decisiontree/levelalgo/levelhelper_classifier.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #include <raft/cudart_utils.h>
 #include "levelkernel_classifier.cuh"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -25,6 +27,8 @@ void initial_metric_classification(
   const int *labels, unsigned int *sample_cnt, const int nrows,
   const int n_unique_labels, std::vector<unsigned int> &histvec,
   T &initial_metric, std::shared_ptr<TemporaryMemory<T, int>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTree::initial_metric_classification @levelhelper_classifier.cuh");
   CUDA_CHECK(cudaMemsetAsync(tempmem->d_parent_hist->data(), 0,
                              n_unique_labels * sizeof(unsigned int),
                              tempmem->stream));
@@ -41,6 +45,7 @@ void initial_metric_classification(
   histvec.assign(tempmem->h_parent_hist->data(),
                  tempmem->h_parent_hist->data() + n_unique_labels);
   initial_metric = F::exec(histvec, nrows);
+  ML::POP_RANGE();
 }
 
 template <typename T>
@@ -50,6 +55,9 @@ void get_histogram_classification(
   const int ncols_sampled, const int n_unique_labels, const int nbins,
   const int n_nodes, const int split_algo,
   std::shared_ptr<TemporaryMemory<T, int>> tempmem, unsigned int *histout) {
+  ML::PUSH_RANGE(
+    "DecisionTree::get_histogram_classification @levelhelper_classifier.cuh");
+
   size_t histcount = ncols_sampled * nbins * n_unique_labels * n_nodes;
   CUDA_CHECK(cudaMemsetAsync(histout, 0, histcount * sizeof(unsigned int),
                              tempmem->stream));
@@ -94,6 +102,7 @@ void get_histogram_classification(
     }
   }
   CUDA_CHECK(cudaGetLastError());
+  ML::POP_RANGE();
 }
 template <typename T, typename F, typename DF>
 void get_best_split_classification(
@@ -107,6 +116,7 @@ void get_best_split_classification(
   std::vector<int> &sparse_nodelist, int *split_colidx, int *split_binidx,
   int *d_split_colidx, int *d_split_binidx,
   std::shared_ptr<TemporaryMemory<T, int>> tempmem) {
+  ML::PUSH_RANGE("get_best_split_classification @levelhelper_classifier.cuh");
   T *quantile = nullptr;
   T *minmax = nullptr;
   if (tempmem->h_quantile != nullptr) quantile = tempmem->h_quantile->data();
@@ -284,6 +294,7 @@ void get_best_split_classification(
     raft::update_device(d_split_binidx, split_binidx, n_nodes, tempmem->stream);
     raft::update_device(d_split_colidx, split_colidx, n_nodes, tempmem->stream);
   }
+  ML::POP_RANGE();
 }
 
 template <typename T>
diff --git a/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh b/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh
index 48c424ba6a..b0ee921bb4 100644
--- a/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh
+++ b/cpp/src/decisiontree/levelalgo/levelhelper_regressor.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ void initial_metric_regression(const T *labels, unsigned int *sample_cnt,
                                const int nrows, T &mean, unsigned int &count,
                                T &initial_metric,
                                std::shared_ptr<TemporaryMemory<T, T>> tempmem) {
+  ML::PUSH_RANGE(
+    "DecisionTree::initial_metric_classification @levelhelper_regressor.cuh");
   CUDA_CHECK(
     cudaMemsetAsync(tempmem->d_mseout->data(), 0, sizeof(T), tempmem->stream));
   CUDA_CHECK(
@@ -52,6 +54,7 @@ void initial_metric_regression(const T *labels, unsigned int *sample_cnt,
   count = tempmem->h_count->data()[0];
   mean = tempmem->h_predout->data()[0] / count;
   initial_metric = tempmem->h_mseout->data()[0] / count;
+  ML::POP_RANGE();
 }
 
 template <typename T>
@@ -63,6 +66,8 @@ void get_mse_regression_fused(const T *data, const T *labels,
                               std::shared_ptr<TemporaryMemory<T, T>> tempmem,
                               T *d_mseout, T *d_predout,
                               unsigned int *d_count) {
+  ML::PUSH_RANGE(
+    "DecisionTree::get_mse_regression_fused @levelhelper_regressor.cuh");
   size_t predcount = ncols_sampled * nbins * n_nodes;
   CUDA_CHECK(
     cudaMemsetAsync(d_mseout, 0, 2 * predcount * sizeof(T), tempmem->stream));
@@ -114,6 +119,7 @@ void get_mse_regression_fused(const T *data, const T *labels,
     }
     CUDA_CHECK(cudaGetLastError());
   }
+  ML::POP_RANGE();
 }
 template <typename T, typename F>
 void get_mse_regression(const T *data, const T *labels, unsigned int *flags,
@@ -123,6 +129,7 @@ void get_mse_regression(const T *data, const T *labels, unsigned int *flags,
                         const int split_algo,
                         std::shared_ptr<TemporaryMemory<T, T>> tempmem,
                         T *d_mseout, T *d_predout, unsigned int *d_count) {
+  ML::PUSH_RANGE("DecisionTree::get_mse_regression @levelhelper_regressor.cuh");
   size_t predcount = ncols_sampled * nbins * n_nodes;
   CUDA_CHECK(
     cudaMemsetAsync(d_mseout, 0, 2 * predcount * sizeof(T), tempmem->stream));
@@ -209,6 +216,7 @@ void get_mse_regression(const T *data, const T *labels, unsigned int *flags,
     }
     CUDA_CHECK(cudaGetLastError());
   }
+  ML::POP_RANGE();
 }
 template <typename T, typename Gain>
 void get_best_split_regression(
@@ -223,6 +231,7 @@ void get_best_split_regression(
   std::vector<int> &sparse_nodelist, int *split_colidx, int *split_binidx,
   int *d_split_colidx, int *d_split_binidx,
   std::shared_ptr<TemporaryMemory<T, T>> tempmem) {
+  ML::PUSH_RANGE("get_best_split_regression @levelhelper_regressor.cuh");
   T *quantile = nullptr;
   T *minmax = nullptr;
   if (tempmem->h_quantile != nullptr) quantile = tempmem->h_quantile->data();
@@ -395,6 +404,7 @@ void get_best_split_regression(
     raft::update_device(d_split_binidx, split_binidx, n_nodes, tempmem->stream);
     raft::update_device(d_split_colidx, split_colidx, n_nodes, tempmem->stream);
   }
+  ML::POP_RANGE();
 }
 
 template <typename T>
@@ -406,6 +416,7 @@ void leaf_eval_regression(float *gain, int curr_depth,
                           const int sparsesize, std::vector<T> &sparse_mean,
                           int &n_nodes_next, std::vector<int> &sparse_nodelist,
                           int &tree_leaf_cnt) {
+  ML::PUSH_RANGE("leaf_eval_regression @levelhelper_regressor.cuh");
   std::vector<int> tmp_sparse_nodelist(sparse_nodelist);
   sparse_nodelist.clear();
 
@@ -434,6 +445,7 @@ void leaf_eval_regression(float *gain, int curr_depth,
   int nleafed = tmp_sparse_nodelist.size() - non_leaf_counter;
   tree_leaf_cnt += nleafed;
   n_nodes_next = 2 * non_leaf_counter;
+  ML::POP_RANGE();
 }
 
 template <typename T>
@@ -466,6 +478,7 @@ void best_split_gather_regression(
   const float min_impurity_split,
   std::shared_ptr<TemporaryMemory<T, T>> tempmem,
   SparseTreeNode<T, T> *d_sparsenodes, int *d_nodelist) {
+  ML::PUSH_RANGE("get_best_split_gather_regression @levelhelper_regressor.cuh");
   const int TPB = TemporaryMemory<T, T>::gather_threads;
   if (split_cr == ML::CRITERION::MSE) {
     if (split_algo == ML::SPLIT_ALGO::HIST) {
@@ -508,6 +521,7 @@ void best_split_gather_regression(
     }
     CUDA_CHECK(cudaGetLastError());
   }
+  ML::POP_RANGE();
 }
 template <typename T>
 void make_leaf_gather_regression(
diff --git a/cpp/src/decisiontree/quantile/quantile.cuh b/cpp/src/decisiontree/quantile/quantile.cuh
index cf3a61d2ec..ae651074b6 100644
--- a/cpp/src/decisiontree/quantile/quantile.cuh
+++ b/cpp/src/decisiontree/quantile/quantile.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cub/cub.cuh>
 #include "quantile.h"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 namespace DecisionTree {
 
@@ -83,7 +85,9 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
 	int max_ncols = free_mem / (2 * n_sampled_rows * sizeof(T));
 	int batch_cols = (max_ncols > ncols) ? ncols : max_ncols;
 	ASSERT(max_ncols != 0, "Cannot preprocess quantiles due to insufficient device memory.");
-	*/
+  */
+
+  ML::PUSH_RANGE("preprocessing quantile @quantile.cuh");
   int batch_cols =
     1;  // Processing one column at a time, for now, until an appropriate getMemInfo function is provided for the deviceAllocator interface.
 
@@ -109,8 +113,10 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
                                                tempmem->stream, batch_cols + 1);
 
   blocks = raft::ceildiv(batch_cols + 1, threads);
+  ML::PUSH_RANGE("set_sorting_offset kernel @quantile.cuh");
   set_sorting_offset<<<blocks, threads, 0, tempmem->stream>>>(
     n_sampled_rows, batch_cols, d_offsets->data());
+  ML::POP_RANGE();
   CUDA_CHECK(cudaGetLastError());
 
   // Determine temporary device storage requirements
@@ -126,15 +132,18 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
 
   d_keys_out = new MLCommon::device_buffer<T>(tempmem->device_allocator,
                                               tempmem->stream, batch_items);
-
+  ML::PUSH_RANGE(
+    "DecisionTree::cub::DeviceRadixSort::SortKeys over batch_items "
+    "@quantile.cuh");
   CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
     d_temp_storage, temp_storage_bytes, d_keys_in, d_keys_out->data(),
     batch_items, 0, 8 * sizeof(T), tempmem->stream));
-
+  ML::POP_RANGE();
   // Allocate temporary storage
   d_temp_storage = new MLCommon::device_buffer<char>(
     tempmem->device_allocator, tempmem->stream, temp_storage_bytes);
 
+  ML::PUSH_RANGE("iterative quantile computation for each batch");
   // Compute quantiles for cur_batch_cols columns per loop iteration.
   for (int batch = 0; batch < batch_cnt; batch++) {
     int cur_batch_cols = (batch == batch_cnt - 1)
@@ -143,20 +152,24 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
 
     int batch_offset = batch * n_sampled_rows * batch_cols;
     int quantile_offset = batch * nbins * batch_cols;
-
+    ML::PUSH_RANGE("DeviceRadixSort::SortKeys");
     CUDA_CHECK(cub::DeviceRadixSort::SortKeys(
       (void *)d_temp_storage->data(), temp_storage_bytes,
       &d_keys_in[batch_offset], d_keys_out->data(), n_sampled_rows, 0,
       8 * sizeof(T), tempmem->stream));
+    ML::POP_RANGE();
 
     blocks = raft::ceildiv(cur_batch_cols * nbins, threads);
+    ML::PUSH_RANGE("get_all_quantiles kernel @quantile.cuh");
     get_all_quantiles<<<blocks, threads, 0, tempmem->stream>>>(
       d_keys_out->data(), &tempmem->d_quantile->data()[quantile_offset],
       n_sampled_rows, cur_batch_cols, nbins);
+    ML::POP_RANGE();
 
     CUDA_CHECK(cudaGetLastError());
     CUDA_CHECK(cudaStreamSynchronize(tempmem->stream));
   }
+  ML::POP_RANGE();
   raft::update_host(tempmem->h_quantile->data(), tempmem->d_quantile->data(),
                     nbins * ncols, tempmem->stream);
   d_keys_out->release(tempmem->stream);
@@ -165,6 +178,7 @@ void preprocess_quantile(const T *data, const unsigned int *rowids,
   delete d_keys_out;
   delete d_offsets;
   delete d_temp_storage;
+  ML::POP_RANGE();
 
   return;
 }
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index 4ec5bfb2a2..632e9a485d 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -524,6 +524,7 @@ ModelHandle concatenate_trees(std::vector<ModelHandle> treelite_handles) {
 void fit(const raft::handle_t& user_handle, RandomForestClassifierF*& forest,
          float* input, int n_rows, int n_cols, int* labels, int n_unique_labels,
          RF_params rf_params, int verbosity) {
+  ML::PUSH_RANGE("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(!forest->trees, "Cannot fit an existing forest.");
   forest->trees =
@@ -534,11 +535,13 @@ void fit(const raft::handle_t& user_handle, RandomForestClassifierF*& forest,
     std::make_shared<rfClassifier<float>>(rf_params);
   rf_classifier->fit(user_handle, input, n_rows, n_cols, labels,
                      n_unique_labels, forest);
+  ML::POP_RANGE();
 }
 
 void fit(const raft::handle_t& user_handle, RandomForestClassifierD*& forest,
          double* input, int n_rows, int n_cols, int* labels,
          int n_unique_labels, RF_params rf_params, int verbosity) {
+  ML::PUSH_RANGE("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(!forest->trees, "Cannot fit an existing forest.");
   forest->trees =
@@ -549,6 +552,7 @@ void fit(const raft::handle_t& user_handle, RandomForestClassifierD*& forest,
     std::make_shared<rfClassifier<double>>(rf_params);
   rf_classifier->fit(user_handle, input, n_rows, n_cols, labels,
                      n_unique_labels, forest);
+  ML::POP_RANGE();
 }
 /** @} */
 
@@ -693,6 +697,7 @@ RF_params set_rf_class_obj(int max_depth, int max_leaves, float max_features,
 void fit(const raft::handle_t& user_handle, RandomForestRegressorF*& forest,
          float* input, int n_rows, int n_cols, float* labels,
          RF_params rf_params, int verbosity) {
+  ML::PUSH_RANGE("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(!forest->trees, "Cannot fit an existing forest.");
   forest->trees =
@@ -702,11 +707,13 @@ void fit(const raft::handle_t& user_handle, RandomForestRegressorF*& forest,
   std::shared_ptr<rfRegressor<float>> rf_regressor =
     std::make_shared<rfRegressor<float>>(rf_params);
   rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, forest);
+  ML::POP_RANGE();
 }
 
 void fit(const raft::handle_t& user_handle, RandomForestRegressorD*& forest,
          double* input, int n_rows, int n_cols, double* labels,
          RF_params rf_params, int verbosity) {
+  ML::PUSH_RANGE("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(!forest->trees, "Cannot fit an existing forest.");
   forest->trees =
@@ -716,6 +723,7 @@ void fit(const raft::handle_t& user_handle, RandomForestRegressorD*& forest,
   std::shared_ptr<rfRegressor<double>> rf_regressor =
     std::make_shared<rfRegressor<double>>(rf_params);
   rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, forest);
+  ML::POP_RANGE();
 }
 /** @} */
 
diff --git a/cpp/src/randomforest/randomforest_impl.cuh b/cpp/src/randomforest/randomforest_impl.cuh
index 04a0634749..091996f2cb 100644
--- a/cpp/src/randomforest/randomforest_impl.cuh
+++ b/cpp/src/randomforest/randomforest_impl.cuh
@@ -27,6 +27,8 @@
 #include <random/permute.cuh>
 #include "randomforest_impl.h"
 
+#include <common/nvtx.hpp>
+
 namespace ML {
 /**
  * @brief Construct rf (random forest) object.
@@ -68,6 +70,7 @@ void rf<T, L>::prepare_fit_per_tree(
   int tree_id, int n_rows, int n_sampled_rows, unsigned int* selected_rows,
   const int num_sms, const cudaStream_t stream,
   const std::shared_ptr<deviceAllocator> device_allocator) {
+  ML::PUSH_RANGE("bootstrapping row IDs @randomforest_impl.cuh");
   int rs = tree_id;
   if (rf_params.seed != 0) rs = rf_params.seed + tree_id;
 
@@ -82,6 +85,7 @@ void rf<T, L>::prepare_fit_per_tree(
     thrust::sequence(thrust::cuda::par.on(stream), selected_rows,
                      selected_rows + n_sampled_rows);
   }
+  ML::POP_RANGE();
 }
 
 template <typename T, typename L>
@@ -153,6 +157,7 @@ void rfClassifier<T>::fit(const raft::handle_t& user_handle, const T* input,
                           int n_rows, int n_cols, int* labels,
                           int n_unique_labels,
                           RandomForestMetaData<T, int>*& forest) {
+  ML::PUSH_RANGE("rfClassifer::fit @randomforest_impl.cuh");
   this->error_checking(input, labels, n_rows, n_cols, false);
 
   const raft::handle_t& handle = user_handle;
@@ -243,6 +248,8 @@ void rfClassifier<T>::fit(const raft::handle_t& user_handle, const T* input,
   }
 
   CUDA_CHECK(cudaStreamSynchronize(user_handle.get_stream()));
+
+  ML::POP_RANGE();
 }
 
 /**
@@ -433,6 +440,7 @@ template <typename T>
 void rfRegressor<T>::fit(const raft::handle_t& user_handle, const T* input,
                          int n_rows, int n_cols, T* labels,
                          RandomForestMetaData<T, T>*& forest) {
+  ML::PUSH_RANGE("rfRegressor::fit @randomforest_impl.cuh");
   this->error_checking(input, labels, n_rows, n_cols, false);
 
   const raft::handle_t& handle = user_handle;
@@ -519,6 +527,8 @@ void rfRegressor<T>::fit(const raft::handle_t& user_handle, const T* input,
   }
 
   CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
+
+  ML::POP_RANGE();
 }
 
 /**
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index c21ed72202..39908aa60c 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -47,6 +47,7 @@ from libc.stdlib cimport calloc, malloc, free
 
 from numba import cuda
 
+from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop
 from cuml.raft.common.handle cimport handle_t
 cimport cuml.common.cuda
 
@@ -439,6 +440,8 @@ class RandomForestClassifier(BaseRandomForestModel,
             y to be of dtype int32. This will increase memory used for
             the method.
         """
+        nvtx_range_push("Fit RF-Classifier @randomforestclassifier.pyx")
+
         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                 convert_dtype)
         cdef uintptr_t X_ptr, y_ptr
@@ -512,6 +515,7 @@ class RandomForestClassifier(BaseRandomForestModel,
         self.handle.sync()
         del X_m
         del y_m
+        nvtx_range_pop()
         return self
 
     @cuml.internals.api_base_return_array(get_output_dtype=True)
@@ -626,6 +630,7 @@ class RandomForestClassifier(BaseRandomForestModel,
         ----------
         y : {}
         """
+        nvtx_range_push("predict RF-Classifier @randomforestclassifier.pyx")
         if num_classes:
             warnings.warn("num_classes is deprecated and will be removed"
                           " in an upcoming version")
@@ -653,6 +658,7 @@ class RandomForestClassifier(BaseRandomForestModel,
                                            fil_sparse_format=fil_sparse_format,
                                            predict_proba=False)
 
+        nvtx_range_pop()
         return preds
 
     def _predict_get_all(self, X, convert_dtype=True) -> CumlArray:
@@ -859,6 +865,8 @@ class RandomForestClassifier(BaseRandomForestModel,
         accuracy : float
            Accuracy of the model [0.0 - 1.0]
         """
+
+        nvtx_range_push("score RF-Classifier @randomforestclassifier.pyx")
         cdef uintptr_t X_ptr, y_ptr
         _, n_rows, _, _ = \
             input_to_cuml_array(X, check_dtype=self.dtype,
@@ -913,6 +921,7 @@ class RandomForestClassifier(BaseRandomForestModel,
         self.handle.sync()
         del(y_m)
         del(preds_m)
+        nvtx_range_pop()
         return self.stats['accuracy']
 
     def get_summary_text(self):
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index 39ab17cde8..e7cadb7eb3 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -47,6 +47,7 @@ from libc.stdlib cimport calloc, malloc, free
 
 from numba import cuda
 
+from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop
 from cuml.raft.common.handle cimport handle_t
 cimport cuml.common.cuda
 
@@ -419,6 +420,8 @@ class RandomForestRegressor(BaseRandomForestModel,
         Perform Random Forest Regression on the input data
 
         """
+        nvtx_range_push("Fit RF-Regressor @randomforestregressor.pyx")
+
         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                 convert_dtype)
 
@@ -485,6 +488,7 @@ class RandomForestRegressor(BaseRandomForestModel,
         self.handle.sync()
         del X_m
         del y_m
+        nvtx_range_pop()
         return self
 
     def _predict_model_on_cpu(self, X, convert_dtype) -> CumlArray:
@@ -579,6 +583,7 @@ class RandomForestRegressor(BaseRandomForestModel,
         y : {}
 
         """
+        nvtx_range_push("predict RF-Regressor @randomforestregressor.pyx")
         if predict_model == "CPU":
             preds = self._predict_model_on_cpu(X, convert_dtype)
 
@@ -597,6 +602,7 @@ class RandomForestRegressor(BaseRandomForestModel,
                 convert_dtype=convert_dtype,
                 fil_sparse_format=fil_sparse_format)
 
+        nvtx_range_pop()
         return preds
 
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)'),
@@ -645,6 +651,7 @@ class RandomForestRegressor(BaseRandomForestModel,
         median_abs_error : float or
         mean_abs_error : float
         """
+        nvtx_range_push("score RF-Regressor @randomforestregressor.pyx")
         from cuml.metrics.regression import r2_score
 
         cdef uintptr_t y_ptr
@@ -710,6 +717,7 @@ class RandomForestRegressor(BaseRandomForestModel,
         self.handle.sync()
         del(y_m)
         del(preds_m)
+        nvtx_range_pop()
         return stats
 
     def get_summary_text(self):