diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 52e0b1bc3f..e3194c0e54 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -241,8 +241,7 @@ if(BUILD_CUML_CPP_LIBRARY)
 
   # common components
   add_library(${CUML_CPP_TARGET} SHARED
-              src/common/logger.cpp
-              src/common/nvtx.cu)
+              src/common/logger.cpp)
 
   # FIL components
   target_sources(${CUML_CPP_TARGET}
@@ -367,7 +366,6 @@ if(BUILD_CUML_CPP_LIBRARY)
 
   target_compile_definitions(${CUML_CPP_TARGET}
     PUBLIC
-      $<$<BOOL:${NVTX}>:NVTX_ENABLED>
       DISABLE_CUSPARSE_DEPRECATED
     PRIVATE
       CUML_CPP_API
@@ -407,7 +405,6 @@ if(BUILD_CUML_CPP_LIBRARY)
       CUDA::cudart
       CUDA::cusparse
       GPUTreeShap::GPUTreeShap
-      $<$<BOOL:${NVTX}>:CUDA::nvToolsExt>
       $<$<BOOL:${LINK_FAISS}>:FAISS::FAISS>
       $<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_static,treelite::treelite>
       $<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_runtime_static,treelite::treelite_runtime>
diff --git a/cpp/cmake/thirdparty/get_raft.cmake b/cpp/cmake/thirdparty/get_raft.cmake
index 514d4b963b..c7282899ef 100644
--- a/cpp/cmake/thirdparty/get_raft.cmake
+++ b/cpp/cmake/thirdparty/get_raft.cmake
@@ -34,8 +34,9 @@ function(find_and_configure_raft)
             GIT_TAG        ${PKG_PINNED_TAG}
             SOURCE_SUBDIR  cpp
             OPTIONS
-            "BUILD_TESTS OFF"
-            )
+              "BUILD_TESTS OFF"
+              "NVTX ${NVTX}"
+    )
 
     if(raft_ADDED)
         message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}")
@@ -58,4 +59,4 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
 find_and_configure_raft(VERSION    ${CUML_MIN_VERSION_raft}
         FORK       rapidsai
         PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
-        )
\ No newline at end of file
+        )
diff --git a/cpp/src/arima/batched_arima.cu b/cpp/src/arima/batched_arima.cu
index cb91831a7f..35cf3354c9 100644
--- a/cpp/src/arima/batched_arima.cu
+++ b/cpp/src/arima/batched_arima.cu
@@ -29,9 +29,9 @@
 #include <cuml/tsa/batched_kalman.hpp>
 
 #include <raft/cudart_utils.h>
-#include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
 #include <metrics/batched/information_criterion.cuh>
+#include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/handle.hpp>
 #include <raft/linalg/matrix_vector_op.cuh>
@@ -104,7 +104,7 @@ void predict(raft::handle_t& handle,
              double* d_lower,
              double* d_upper)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
   const auto stream = handle.get_stream();
 
   bool diff     = order.need_diff() && pre_diff && level == 0;
@@ -245,8 +245,6 @@ void predict(raft::handle_t& handle,
       });
     /// TODO: 2D copy kernel?
   }
-
-  ML::POP_RANGE();
 }
 
 /**
@@ -360,7 +358,7 @@ void conditional_sum_of_squares(raft::handle_t& handle,
                                 double* d_loglike,
                                 int truncate)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
   auto stream = handle.get_stream();
 
   int n_phi     = order.n_phi();
@@ -393,8 +391,6 @@ void conditional_sum_of_squares(raft::handle_t& handle,
                                                                                start_y,
                                                                                start_v);
   CUDA_CHECK(cudaPeekAtLastError());
-
-  ML::POP_RANGE();
 }
 
 void batched_loglike(raft::handle_t& handle,
@@ -417,7 +413,7 @@ void batched_loglike(raft::handle_t& handle,
                      double* d_lower,
                      double* d_upper)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
 
   auto stream = handle.get_stream();
 
@@ -473,7 +469,6 @@ void batched_loglike(raft::handle_t& handle,
     /* Tranfer log-likelihood device -> host */
     raft::update_host(loglike, d_loglike, batch_size, stream);
   }
-  ML::POP_RANGE();
 }
 
 void batched_loglike(raft::handle_t& handle,
@@ -490,7 +485,7 @@ void batched_loglike(raft::handle_t& handle,
                      LoglikeMethod method,
                      int truncate)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
 
   // unpack parameters
   auto stream = handle.get_stream();
@@ -518,8 +513,6 @@ void batched_loglike(raft::handle_t& handle,
                   host_loglike,
                   method,
                   truncate);
-
-  ML::POP_RANGE();
 }
 
 void batched_loglike_grad(raft::handle_t& handle,
@@ -536,7 +529,7 @@ void batched_loglike_grad(raft::handle_t& handle,
                           LoglikeMethod method,
                           int truncate)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
   auto stream   = handle.get_stream();
   auto counting = thrust::make_counting_iterator(0);
   int N         = order.complexity();
@@ -597,8 +590,6 @@ void batched_loglike_grad(raft::handle_t& handle,
         d_x_pert[N * bid + i] = d_x[N * bid + i];
       });
   }
-
-  ML::POP_RANGE();
 }
 
 void information_criterion(raft::handle_t& handle,
@@ -612,7 +603,7 @@ void information_criterion(raft::handle_t& handle,
                            double* d_ic,
                            int ic_type)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
   auto stream = handle.get_stream();
 
   /* Compute log-likelihood in d_ic */
@@ -628,8 +619,6 @@ void information_criterion(raft::handle_t& handle,
     batch_size,
     n_obs - order.n_diff(),
     stream);
-
-  ML::POP_RANGE();
 }
 
 /**
@@ -962,7 +951,7 @@ void estimate_x0(raft::handle_t& handle,
                  const ARIMAOrder& order,
                  bool missing)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
   const auto& handle_impl = handle;
   auto stream             = handle_impl.get_stream();
   auto cublas_handle      = handle_impl.get_cublas_handle();
@@ -1007,7 +996,6 @@ void estimate_x0(raft::handle_t& handle,
 
   // Do the computation of the initial parameters
   _start_params(handle, params, bm_yd, bm_exog_diff, order);
-  ML::POP_RANGE();
 }
 
 }  // namespace ML
diff --git a/cpp/src/arima/batched_kalman.cu b/cpp/src/arima/batched_kalman.cu
index 1c189dde13..01306b0862 100644
--- a/cpp/src/arima/batched_kalman.cu
+++ b/cpp/src/arima/batched_kalman.cu
@@ -30,9 +30,9 @@
 #include <raft/linalg/binary_op.cuh>
 #include <rmm/device_uvector.hpp>
 
-#include <common/nvtx.hpp>
 #include <linalg/batched/matrix.cuh>
 #include <linalg/block.cuh>
+#include <raft/common/nvtx.hpp>
 #include <timeSeries/arima_helpers.cuh>
 
 namespace ML {
@@ -1283,7 +1283,7 @@ void _batched_kalman_filter(raft::handle_t& handle,
   MLCommon::LinAlg::Batched::b_gemm(false, true, rd, rd, 1, 1.0, RQb, Rb, 0.0, RQR);
 
   // Durbin Koopman "Time Series Analysis" pg 138
-  ML::PUSH_RANGE("Init P");
+  raft::common::nvtx::push_range("Init P");
   MLCommon::LinAlg::Batched::Matrix<double> P(
     rd, rd, batch_size, cublasHandle, arima_mem.P_batches, arima_mem.P_dense, stream, true);
   {
@@ -1326,7 +1326,7 @@ void _batched_kalman_filter(raft::handle_t& handle,
       _lyapunov_wrapper(handle, arima_mem, Tb, RQR, P, rd);
     }
   }
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   // Initialize the state alpha by solving (I - T*) x* = c with:
   //     | mu |
@@ -1442,7 +1442,7 @@ void init_batched_kalman_matrices(raft::handle_t& handle,
                                   double* d_R_b,
                                   double* d_T_b)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
 
   auto stream = handle.get_stream();
 
@@ -1535,8 +1535,6 @@ void init_batched_kalman_matrices(raft::handle_t& handle,
     // If rd=2 and phi_2=-1, I-TxT is singular
     if (rd == 2 && order.p == 2 && abs(batch_T[1] + 1) < 0.01) { batch_T[1] = -0.99; }
   });
-
-  ML::POP_RANGE();
 }
 
 void batched_kalman_filter(raft::handle_t& handle,
@@ -1556,7 +1554,7 @@ void batched_kalman_filter(raft::handle_t& handle,
                            double* d_lower,
                            double* d_upper)
 {
-  ML::PUSH_RANGE(__func__);
+  raft::common::nvtx::range fun_scope(__func__);
 
   auto cublasHandle = handle.get_cublas_handle();
   auto stream       = handle.get_stream();
@@ -1607,8 +1605,6 @@ void batched_kalman_filter(raft::handle_t& handle,
                          level,
                          d_lower,
                          d_upper);
-
-  ML::POP_RANGE();
 }
 
 void batched_jones_transform(raft::handle_t& handle,
diff --git a/cpp/src/common/nvtx.cu b/cpp/src/common/nvtx.cu
deleted file mode 100644
index 5f778e0bec..0000000000
--- a/cpp/src/common/nvtx.cu
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <stdint.h>
-#include <stdlib.h>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include "nvtx.hpp"
-
-namespace ML {
-
-/**
- * @brief An internal struct to store associated state with the color
- * generator
- */
-struct ColorGenState {
-  /** collection of all tagged colors generated so far */
-  static std::unordered_map<std::string, uint32_t> allColors;
-  /** mutex for accessing the above map */
-  static std::mutex mapMutex;
-  /** saturation */
-  static constexpr float S = 0.9f;
-  /** value */
-  static constexpr float V = 0.85f;
-  /** golden ratio */
-  static constexpr float Phi = 1.61803f;
-  /** inverse golden ratio */
-  static constexpr float InvPhi = 1.f / Phi;
-};
-
-std::unordered_map<std::string, uint32_t> ColorGenState::allColors;
-std::mutex ColorGenState::mapMutex;
-
-// all h, s, v are in range [0, 1]
-// Ref: http://en.wikipedia.org/wiki/HSL_and_HSV#Converting_to_RGB
-uint32_t hsv2rgb(float h, float s, float v)
-{
-  uint32_t out = 0xff000000u;
-  if (s <= 0.0f) { return out; }
-  // convert hue from [0, 1] range to [0, 360]
-  float h_deg = h * 360.f;
-  if (0.f > h_deg || h_deg >= 360.f) h_deg = 0.f;
-  h_deg /= 60.f;
-  int h_range = (int)h_deg;
-  float h_mod = h_deg - h_range;
-  float x     = v * (1.f - s);
-  float y     = v * (1.f - (s * h_mod));
-  float z     = v * (1.f - (s * (1.f - h_mod)));
-  float r, g, b;
-  switch (h_range) {
-    case 0:
-      r = v;
-      g = z;
-      b = x;
-      break;
-    case 1:
-      r = y;
-      g = v;
-      b = x;
-      break;
-    case 2:
-      r = x;
-      g = v;
-      b = z;
-      break;
-    case 3:
-      r = x;
-      g = y;
-      b = v;
-      break;
-    case 4:
-      r = z;
-      g = x;
-      b = v;
-      break;
-    case 5:
-    default:
-      r = v;
-      g = x;
-      b = y;
-      break;
-  }
-  out |= (uint32_t(r * 256.f) << 16);
-  out |= (uint32_t(g * 256.f) << 8);
-  out |= uint32_t(b * 256.f);
-  return out;
-}
-
-/**
- * @brief Helper method to generate 'visually distinct' colors.
- * Inspired from https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/
- * However, if an associated tag is passed, it will look up in its history for
- * any generated color against this tag and if found, just returns it, else
- * generates a new color, assigns a tag to it and stores it for future usage.
- * Such a thing is very useful for nvtx markers where the ranges associated
- * with a specific tag should ideally get the same color for the purpose of
- * visualizing it on nsight-systems timeline.
- * @param tag look for any previously generated colors with this tag or
- * associate the currently generated color with it
- * @return returns 32b RGB integer with alpha channel set of 0xff
- */
-uint32_t generateNextColor(const std::string& tag)
-{
-  std::lock_guard<std::mutex> guard(ColorGenState::mapMutex);
-  if (!tag.empty()) {
-    auto itr = ColorGenState::allColors.find(tag);
-    if (itr != ColorGenState::allColors.end()) { return itr->second; }
-  }
-  float h = rand() * 1.f / RAND_MAX;
-  h += ColorGenState::InvPhi;
-  if (h >= 1.f) h -= 1.f;
-  auto rgb = hsv2rgb(h, ColorGenState::S, ColorGenState::V);
-  if (!tag.empty()) { ColorGenState::allColors[tag] = rgb; }
-  return rgb;
-}
-
-#ifdef NVTX_ENABLED
-
-#include <nvToolsExt.h>
-
-nvtxDomainHandle_t domain = nvtxDomainCreateA("cuml_cpp");
-
-void PUSH_RANGE(const char* name, cudaStream_t stream)
-{
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  PUSH_RANGE(name);
-}
-
-void POP_RANGE(cudaStream_t stream)
-{
-  CUDA_CHECK(cudaStreamSynchronize(stream));
-  POP_RANGE();
-}
-
-void PUSH_RANGE(const char* name)
-{
-  nvtxEventAttributes_t eventAttrib = {0};
-  eventAttrib.version               = NVTX_VERSION;
-  eventAttrib.size                  = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
-  eventAttrib.colorType             = NVTX_COLOR_ARGB;
-  eventAttrib.color                 = generateNextColor(name);
-  eventAttrib.messageType           = NVTX_MESSAGE_TYPE_ASCII;
-  eventAttrib.message.ascii         = name;
-  nvtxDomainRangePushEx(domain, &eventAttrib);
-}
-
-void POP_RANGE() { nvtxDomainRangePop(domain); }
-
-#else  // NVTX_ENABLED
-
-void PUSH_RANGE(const char* name, cudaStream_t stream) {}
-
-void POP_RANGE(cudaStream_t stream) {}
-
-void PUSH_RANGE(const char* name) {}
-
-void POP_RANGE() {}
-
-#endif  // NVTX_ENABLED
-
-}  // end namespace ML
diff --git a/cpp/src/common/nvtx.hpp b/cpp/src/common/nvtx.hpp
index bf9d16ed8d..1d9a2a3336 100644
--- a/cpp/src/common/nvtx.hpp
+++ b/cpp/src/common/nvtx.hpp
@@ -16,7 +16,8 @@
 
 #pragma once
 
 #include <raft/cudart_utils.h>
+#include <raft/common/nvtx.hpp>
 
 namespace ML {
 
@@ -25,21 +26,47 @@ namespace ML {
  * @param name range name
  * @param stream stream to synchronize
  */
-void PUSH_RANGE(const char* name, cudaStream_t stream);
+[[deprecated("Use new raft::common::nvtx::push_range from <raft/common/nvtx.hpp>")]] inline void
+PUSH_RANGE(const char* name, cudaStream_t stream)
+{
+#ifdef NVTX_ENABLED
+  // Preserve the documented contract of this legacy overload: synchronize the
+  // stream before opening the range. Only done in NVTX builds, as in the old nvtx.cu.
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
+  raft::common::nvtx::push_range(name);
+}
 
 /**
  * @brief Synchronize CUDA stream and pop the latest nvtx range
  * @param stream stream to synchronize
  */
-void POP_RANGE(cudaStream_t stream);
+[[deprecated("Use new raft::common::nvtx::pop_range from <raft/common/nvtx.hpp>")]] inline void
+POP_RANGE(cudaStream_t stream)
+{
+#ifdef NVTX_ENABLED
+  // Preserve the documented contract of this legacy overload: synchronize the
+  // stream before closing the range. Only done in NVTX builds, as in the old nvtx.cu.
+  CUDA_CHECK(cudaStreamSynchronize(stream));
+#endif
+  raft::common::nvtx::pop_range();
+}
 
 /**
  * @brief Push a named nvtx range
  * @param name range name
  */
-void PUSH_RANGE(const char* name);
+[[deprecated("Use new raft::common::nvtx::push_range from <raft/common/nvtx.hpp>")]] inline void
+PUSH_RANGE(const char* name)
+{
+  raft::common::nvtx::push_range(name);
+}
 
 /** Pop the latest range */
-void POP_RANGE();
+[[deprecated("Use new raft::common::nvtx::pop_range from <raft/common/nvtx.hpp>")]] inline void
+POP_RANGE()
+{
+  raft::common::nvtx::pop_range();
+}
 
 }  // end namespace ML
diff --git a/cpp/src/dbscan/dbscan.cuh b/cpp/src/dbscan/dbscan.cuh
index 9d7c1061cc..467476070e 100644
--- a/cpp/src/dbscan/dbscan.cuh
+++ b/cpp/src/dbscan/dbscan.cuh
@@ -18,7 +18,7 @@
 
 #include "runner.cuh"
 
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 #include <cuml/cluster/dbscan.hpp>
 #include <cuml/common/logger.hpp>
@@ -108,7 +108,7 @@ void dbscanFitImpl(const raft::handle_t& handle,
                    cudaStream_t stream,
                    int verbosity)
 {
-  ML::PUSH_RANGE("ML::Dbscan::Fit");
+  raft::common::nvtx::range fun_scope("ML::Dbscan::Fit");
   ML::Logger::get().setLevel(verbosity);
   int algo_vd  = (metric == raft::distance::Precomputed) ? 2 : 1;
   int algo_adj = 1;
@@ -201,7 +201,6 @@ void dbscanFitImpl(const raft::handle_t& handle,
                               workspace.data(),
                               batch_size,
                               stream);
-  ML::POP_RANGE();
 }
 
 }  // namespace Dbscan
diff --git a/cpp/src/dbscan/mergelabels/tree_reduction.cuh b/cpp/src/dbscan/mergelabels/tree_reduction.cuh
index 98ec5f8cb0..6128f9d7fd 100644
--- a/cpp/src/dbscan/mergelabels/tree_reduction.cuh
+++ b/cpp/src/dbscan/mergelabels/tree_reduction.cuh
@@ -18,7 +18,7 @@
 
 #include "runner.cuh"
 
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 #include <cuml/common/logger.hpp>
 
@@ -76,9 +76,8 @@ void tree_reduction(const raft::handle_t& handle,
 
     if (receiver) {
       CUML_LOG_DEBUG("--> Merge labels");
-      ML::PUSH_RANGE("Trace::Dbscan::MergeLabels");
+      raft::common::nvtx::range fun_scope("Trace::Dbscan::MergeLabels");
       MergeLabels::run<Index_>(handle, labels, labels_temp, mask, work_buffer, m, N, stream);
-      ML::POP_RANGE();
     }
 
     s *= 2;
diff --git a/cpp/src/dbscan/runner.cuh b/cpp/src/dbscan/runner.cuh
index 82d5c9bb14..9e0c2e6148 100644
--- a/cpp/src/dbscan/runner.cuh
+++ b/cpp/src/dbscan/runner.cuh
@@ -17,8 +17,8 @@
 #pragma once
 
 #include <raft/cudart_utils.h>
-#include <common/nvtx.hpp>
 #include <label/classlabels.cuh>
+#include <raft/common/nvtx.hpp>
 #include <raft/sparse/csr.hpp>
 #include "adjgraph/runner.cuh"
 #include "corepoints/compute.cuh"
@@ -29,7 +29,7 @@
 
 #include <cuml/common/logger.hpp>
 
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 #include <label/classlabels.cuh>
 
@@ -188,15 +188,15 @@ std::size_t run(const raft::handle_t& handle,
       "- Batch %d / %ld (%ld samples)", i + 1, (unsigned long)n_batches, (unsigned long)n_points);
 
     CUML_LOG_DEBUG("--> Computing vertex degrees");
-    ML::PUSH_RANGE("Trace::Dbscan::VertexDeg");
+    raft::common::nvtx::push_range("Trace::Dbscan::VertexDeg");
     VertexDeg::run<Type_f, Index_>(
       handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream);
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
 
     CUML_LOG_DEBUG("--> Computing core point mask");
-    ML::PUSH_RANGE("Trace::Dbscan::CorePoints");
+    raft::common::nvtx::push_range("Trace::Dbscan::CorePoints");
     CorePoints::compute<Index_>(handle, vd, core_pts, min_pts, start_vertex_id, n_points, stream);
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
   }
   // 2. Exchange with the other workers
   if (opg) CorePoints::exchange(handle, core_pts, N, start_row, stream);
@@ -216,26 +216,26 @@ std::size_t run(const raft::handle_t& handle,
     // i==0 -> adj and vd for batch 0 already in memory
     if (i > 0) {
       CUML_LOG_DEBUG("--> Computing vertex degrees");
-      ML::PUSH_RANGE("Trace::Dbscan::VertexDeg");
+      raft::common::nvtx::push_range("Trace::Dbscan::VertexDeg");
       VertexDeg::run<Type_f, Index_>(
         handle, adj, vd, x, eps, N, D, algo_vd, start_vertex_id, n_points, stream);
-      ML::POP_RANGE();
+      raft::common::nvtx::pop_range();
     }
     raft::update_host(&curradjlen, vd + n_points, 1, stream);
     CUDA_CHECK(cudaStreamSynchronize(stream));
 
     CUML_LOG_DEBUG("--> Computing adjacency graph with %ld nnz.", (unsigned long)curradjlen);
-    ML::PUSH_RANGE("Trace::Dbscan::AdjGraph");
+    raft::common::nvtx::push_range("Trace::Dbscan::AdjGraph");
     if (curradjlen > maxadjlen || adj_graph.data() == NULL) {
       maxadjlen = curradjlen;
       adj_graph.resize(maxadjlen, stream);
     }
     AdjGraph::run<Index_>(
       handle, adj, vd, adj_graph.data(), curradjlen, ex_scan, N, algo_adj, n_points, stream);
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
 
     CUML_LOG_DEBUG("--> Computing connected components");
-    ML::PUSH_RANGE("Trace::Dbscan::WeakCC");
+    raft::common::nvtx::push_range("Trace::Dbscan::WeakCC");
     raft::sparse::weak_cc_batched<Index_>(
       i == 0 ? labels : labels_temp,
       ex_scan,
@@ -249,7 +249,7 @@ std::size_t run(const raft::handle_t& handle,
       [core_pts, N] __device__(Index_ global_id) {
         return global_id < N ? __ldg((char*)core_pts + global_id) : 0;
       });
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
 
     if (i > 0) {
       // The labels_temp array contains the labelling for the neighborhood
@@ -259,9 +259,9 @@ std::size_t run(const raft::handle_t& handle,
       // weak_cc_batched and skipping the merge step would lead to incorrect
       // results as described in #3094.
       CUML_LOG_DEBUG("--> Accumulating labels");
-      ML::PUSH_RANGE("Trace::Dbscan::MergeLabels");
+      raft::common::nvtx::push_range("Trace::Dbscan::MergeLabels");
       MergeLabels::run<Index_>(handle, labels, labels_temp, core_pts, work_buffer, m, N, stream);
-      ML::POP_RANGE();
+      raft::common::nvtx::pop_range();
     }
   }
 
@@ -273,16 +273,16 @@ std::size_t run(const raft::handle_t& handle,
 
   // Final relabel
   if (my_rank == 0) {
-    ML::PUSH_RANGE("Trace::Dbscan::FinalRelabel");
+    raft::common::nvtx::push_range("Trace::Dbscan::FinalRelabel");
     if (algo_ccl == 2) final_relabel(labels, N, stream);
     std::size_t nblks = raft::ceildiv<std::size_t>(N, TPB);
     relabelForSkl<Index_><<<nblks, TPB, 0, stream>>>(labels, N, MAX_LABEL);
     CUDA_CHECK(cudaPeekAtLastError());
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
 
     // Calculate the core_indices only if an array was passed in
     if (core_indices != nullptr) {
-      ML::PUSH_RANGE("Trace::Dbscan::CoreSampleIndices");
+      raft::common::nvtx::range fun_scope("Trace::Dbscan::CoreSampleIndices");
 
       // Create the execution policy
       auto thrust_exec_policy = handle.get_thrust_policy();
@@ -304,8 +304,6 @@ std::size_t run(const raft::handle_t& handle,
                       dev_core_pts,
                       dev_core_indices,
                       [=] __device__(const bool is_core_point) { return is_core_point; });
-
-      ML::POP_RANGE();
     }
   }
 
diff --git a/cpp/src/decisiontree/batched-levelalgo/builder.cuh b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
index 9b1a0169ba..f6fd7f26a2 100644
--- a/cpp/src/decisiontree/batched-levelalgo/builder.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/builder.cuh
@@ -26,8 +26,8 @@
 #include <raft/cuda_utils.cuh>
 #include "kernels/builder_kernels.cuh"
 
-#include <common/nvtx.hpp>
 #include <deque>
+#include <raft/common/nvtx.hpp>
 #include <utility>
 
 namespace ML {
@@ -243,7 +243,8 @@ struct Builder {
   auto workspaceSize() const
   {
     size_t d_wsize = 0, h_wsize = 0;
-    ML::PUSH_RANGE("Builder::workspaceSize @builder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::range fun_scope(
+      "Builder::workspaceSize @builder_base.cuh [batched-levelalgo]");
     auto max_batch   = params.max_batch_size;
     size_t nHistBins = max_batch * (params.n_bins) * n_blks_for_cols * input.numOutputs;
 
@@ -261,7 +262,6 @@ struct Builder {
       calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks);
     h_wsize += calculateAlignedBytes(sizeof(SplitT) * max_batch);  // splits
 
-    ML::POP_RANGE();
     return std::make_pair(d_wsize, h_wsize);
   }
 
@@ -274,7 +274,8 @@ struct Builder {
    */
   void assignWorkspace(char* d_wspace, char* h_wspace)
   {
-    ML::PUSH_RANGE("Builder::assignWorkspace @builder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::range fun_scope(
+      "Builder::assignWorkspace @builder_base.cuh [batched-levelalgo]");
     auto max_batch   = params.max_batch_size;
     auto n_col_blks  = n_blks_for_cols;
     size_t nHistBins = max_batch * (params.n_bins) * n_blks_for_cols * input.numOutputs;
@@ -303,12 +304,11 @@ struct Builder {
     h_wspace += calculateAlignedBytes(sizeof(WorkloadInfo<IdxT>) * max_blocks);
     h_splits = reinterpret_cast<SplitT*>(h_wspace);
     h_wspace += calculateAlignedBytes(sizeof(SplitT) * max_batch);
-    ML::POP_RANGE();
   }
 
   std::shared_ptr<DT::TreeMetaDataNode<DataT, LabelT>> train()
   {
-    ML::PUSH_RANGE("Builder::train @builder.cuh [batched-levelalgo]");
+    raft::common::nvtx::range fun_scope("Builder::train @builder.cuh [batched-levelalgo]");
     MLCommon::TimerCPU timer;
     NodeQueue<DataT, LabelT> queue(params, this->maxNodes(), input.nSampledRows, input.numOutputs);
     while (queue.HasWork()) {
@@ -319,7 +319,6 @@ struct Builder {
     auto tree = queue.GetTree();
     this->SetLeafPredictions(tree, queue.GetInstanceRanges());
     tree->train_time = timer.getElapsedMilliseconds();
-    ML::POP_RANGE();
     return tree;
   }
 
@@ -349,7 +348,7 @@ struct Builder {
 
   auto doSplit(const std::vector<NodeWorkItem>& work_items)
   {
-    ML::PUSH_RANGE("Builder::doSplit @bulder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::range fun_scope("Builder::doSplit @bulder_base.cuh [batched-levelalgo]");
     // start fresh on the number of *new* nodes created in this batch
     CUDA_CHECK(cudaMemsetAsync(n_nodes, 0, sizeof(IdxT), builder_stream));
     initSplit<DataT, IdxT, TPB_DEFAULT>(splits, work_items.size(), builder_stream);
@@ -368,7 +367,7 @@ struct Builder {
 
     // create child nodes (or make the current ones leaf)
     auto smemSize = 2 * sizeof(IdxT) * TPB_DEFAULT;
-    ML::PUSH_RANGE("nodeSplitKernel @builder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::push_range("nodeSplitKernel @builder_base.cuh [batched-levelalgo]");
     nodeSplitKernel<DataT, LabelT, IdxT, TPB_DEFAULT>
       <<<work_items.size(), TPB_DEFAULT, smemSize, builder_stream>>>(params.max_depth,
                                                                      params.min_samples_leaf,
@@ -379,10 +378,9 @@ struct Builder {
                                                                      d_work_items,
                                                                      splits);
     CUDA_CHECK(cudaGetLastError());
-    ML::POP_RANGE();
+    raft::common::nvtx::pop_range();
     raft::update_host(h_splits, splits, work_items.size(), builder_stream);
     CUDA_CHECK(cudaStreamSynchronize(builder_stream));
-    ML::POP_RANGE();
     return std::make_tuple(h_splits, work_items.size());
   }
 
@@ -406,7 +404,8 @@ struct Builder {
   void computeSplit(IdxT col, IdxT batchSize, size_t total_blocks, size_t large_blocks)
   {
     if (total_blocks == 0) return;
-    ML::PUSH_RANGE("Builder::computeSplit @builder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::range fun_scope(
+      "Builder::computeSplit @builder_base.cuh [batched-levelalgo]");
     auto nbins    = params.n_bins;
     auto nclasses = input.numOutputs;
     auto colBlks  = std::min(n_blks_for_cols, input.nSampledCols - col);
@@ -415,7 +414,8 @@ struct Builder {
     dim3 grid(total_blocks, colBlks, 1);
     int nHistBins = large_blocks * nbins * colBlks * nclasses;
     CUDA_CHECK(cudaMemsetAsync(hist, 0, sizeof(BinT) * nHistBins, builder_stream));
-    ML::PUSH_RANGE("computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]");
+    raft::common::nvtx::range kernel_scope(
+      "computeSplitClassificationKernel @builder_base.cuh [batched-levelalgo]");
     ObjectiveT objective(input.numOutputs, params.min_samples_leaf);
     computeSplitKernel<DataT, LabelT, IdxT, TPB_DEFAULT>
       <<<grid, TPB_DEFAULT, smemSize, builder_stream>>>(hist,
@@ -433,8 +433,6 @@ struct Builder {
                                                         treeid,
                                                         workload_info,
                                                         seed);
-    ML::POP_RANGE();  // computeSplitKernel
-    ML::POP_RANGE();  // Builder::computeSplit
   }
 
   // Set the leaf value predictions in batch
diff --git a/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh b/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh
index 6c6714a949..b8efcd2c96 100644
--- a/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh
+++ b/cpp/src/decisiontree/batched-levelalgo/quantiles.cuh
@@ -24,7 +24,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 namespace ML {
 namespace DT {
diff --git a/cpp/src/decisiontree/decisiontree.cuh b/cpp/src/decisiontree/decisiontree.cuh
index 1d8b398716..1f61ab84b5 100644
--- a/cpp/src/decisiontree/decisiontree.cuh
+++ b/cpp/src/decisiontree/decisiontree.cuh
@@ -28,11 +28,11 @@
 
 #include <algorithm>
 #include <climits>
-#include <common/nvtx.hpp>
 #include <iomanip>
 #include <locale>
 #include <map>
 #include <numeric>
+#include <raft/common/nvtx.hpp>
 #include <random>
 #include <vector>
 
diff --git a/cpp/src/glm/ols.cuh b/cpp/src/glm/ols.cuh
index 0334f72906..ae7e498892 100644
--- a/cpp/src/glm/ols.cuh
+++ b/cpp/src/glm/ols.cuh
@@ -94,7 +94,7 @@ void olsFit(const raft::handle_t& handle,
   int selectedAlgo = algo;
   if (n_cols > n_rows || n_cols == 1) selectedAlgo = 0;
 
-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::ols-lstsq*", stream);
+  raft::common::nvtx::push_range("ML::GLM::olsFit/algo-%d", selectedAlgo);
   switch (selectedAlgo) {
     case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
     case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
@@ -104,7 +104,7 @@ void olsFit(const raft::handle_t& handle,
       ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
       break;
   }
-  ML::POP_RANGE(stream);
+  raft::common::nvtx::pop_range();
 
   if (fit_intercept) {
     postProcessData(handle,
diff --git a/cpp/src/randomforest/randomforest.cu b/cpp/src/randomforest/randomforest.cu
index bbfc3ed4ea..0ff4790e93 100644
--- a/cpp/src/randomforest/randomforest.cu
+++ b/cpp/src/randomforest/randomforest.cu
@@ -444,7 +444,7 @@ void fit(const raft::handle_t& user_handle,
          RF_params rf_params,
          int verbosity)
 {
-  ML::PUSH_RANGE("RF::fit @randomforest.cu");
+  raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(forest->trees.empty(), "Cannot fit an existing forest.");
   forest->trees.resize(rf_params.n_trees);
@@ -453,7 +453,6 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<float, int>> rf_classifier =
     std::make_shared<RandomForest<float, int>>(rf_params, RF_type::CLASSIFICATION);
   rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest);
-  ML::POP_RANGE();
 }
 
 void fit(const raft::handle_t& user_handle,
@@ -466,7 +465,7 @@ void fit(const raft::handle_t& user_handle,
          RF_params rf_params,
          int verbosity)
 {
-  ML::PUSH_RANGE("RF::fit @randomforest.cu");
+  raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(forest->trees.empty(), "Cannot fit an existing forest.");
   forest->trees.resize(rf_params.n_trees);
@@ -475,7 +474,6 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<double, int>> rf_classifier =
     std::make_shared<RandomForest<double, int>>(rf_params, RF_type::CLASSIFICATION);
   rf_classifier->fit(user_handle, input, n_rows, n_cols, labels, n_unique_labels, forest);
-  ML::POP_RANGE();
 }
 /** @} */
 
@@ -636,7 +634,7 @@ void fit(const raft::handle_t& user_handle,
          RF_params rf_params,
          int verbosity)
 {
-  ML::PUSH_RANGE("RF::fit @randomforest.cu");
+  raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(forest->trees.empty(), "Cannot fit an existing forest.");
   forest->trees.resize(rf_params.n_trees);
@@ -645,7 +643,6 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<float, float>> rf_regressor =
     std::make_shared<RandomForest<float, float>>(rf_params, RF_type::REGRESSION);
   rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, 1, forest);
-  ML::POP_RANGE();
 }
 
 void fit(const raft::handle_t& user_handle,
@@ -657,7 +654,7 @@ void fit(const raft::handle_t& user_handle,
          RF_params rf_params,
          int verbosity)
 {
-  ML::PUSH_RANGE("RF::fit @randomforest.cu");
+  raft::common::nvtx::range fun_scope("RF::fit @randomforest.cu");
   ML::Logger::get().setLevel(verbosity);
   ASSERT(forest->trees.empty(), "Cannot fit an existing forest.");
   forest->trees.resize(rf_params.n_trees);
@@ -666,7 +663,6 @@ void fit(const raft::handle_t& user_handle,
   std::shared_ptr<RandomForest<double, double>> rf_regressor =
     std::make_shared<RandomForest<double, double>>(rf_params, RF_type::REGRESSION);
   rf_regressor->fit(user_handle, input, n_rows, n_cols, labels, 1, forest);
-  ML::POP_RANGE();
 }
 /** @} */
 
diff --git a/cpp/src/randomforest/randomforest.cuh b/cpp/src/randomforest/randomforest.cuh
index 13deda9581..1c3d9da78d 100644
--- a/cpp/src/randomforest/randomforest.cuh
+++ b/cpp/src/randomforest/randomforest.cuh
@@ -16,7 +16,7 @@
 
 #pragma once
 
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 #include <decisiontree/treelite_util.h>
 #include <decisiontree/batched-levelalgo/quantiles.cuh>
@@ -52,7 +52,7 @@ class RandomForest {
                       rmm::device_uvector<int>* selected_rows,
                       const cudaStream_t stream)
   {
-    ML::PUSH_RANGE("bootstrapping row IDs @randomforest.cuh");
+    raft::common::nvtx::range fun_scope("bootstrapping row IDs @randomforest.cuh");
 
     // Hash these together so they are uncorrelated
     auto rs = DT::fnv1a32_basis;
@@ -67,7 +67,6 @@ class RandomForest {
       // Use all the samples from the dataset
       thrust::sequence(thrust::cuda::par.on(stream), selected_rows->begin(), selected_rows->end());
     }
-    ML::POP_RANGE();
   }
 
   void error_checking(const T* input, L* predictions, int n_rows, int n_cols, bool predict) const
@@ -121,7 +120,7 @@ class RandomForest {
            int n_unique_labels,
            RandomForestMetaData<T, L>*& forest)
   {
-    ML::PUSH_RANGE("RandomForest::fit @randomforest.cuh");
+    raft::common::nvtx::range fun_scope("RandomForest::fit @randomforest.cuh");
     this->error_checking(input, labels, n_rows, n_cols, false);
     const raft::handle_t& handle = user_handle;
     int n_sampled_rows           = 0;
@@ -188,7 +187,6 @@ class RandomForest {
     // Cleanup
     handle.sync_stream_pool();
     CUDA_CHECK(cudaStreamSynchronize(handle.get_stream()));
-    ML::POP_RANGE();
   }
 
   /**
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index c6d36d08b6..551fff0f12 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -28,9 +28,9 @@
 #include <thrust/device_ptr.h>
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <common/nvtx.hpp>
 #include <label/classlabels.cuh>
 #include <matrix/kernelfactory.cuh>
+#include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/linalg/gemm.cuh>
 #include <raft/linalg/map.cuh>
@@ -376,7 +376,7 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
   const int coefCols         = narrowDown(model.coefCols());
   const std::size_t coefRows = model.coefRows;
 
-  ML::PUSH_RANGE("Trace::LinearSVMModel::fit");
+  raft::common::nvtx::range fun_scope("Trace::LinearSVMModel::fit");
 
   auto nCols1 = nCols + int(params.fit_intercept && params.penalized_intercept);
   T iC        = params.C > 0 ? (1.0 / params.C) : 1.0;
@@ -504,7 +504,6 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
       raft::linalg::transpose(handle, ps1, model.probScale, 2, coefCols, stream);
   }
 
-  ML::POP_RANGE();
   return model;
 }
 
diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
index f592f6b0cc..a169830e9e 100644
--- a/cpp/src/umap/runner.cuh
+++ b/cpp/src/umap/runner.cuh
@@ -45,7 +45,7 @@
 #include <raft/cuda_utils.cuh>
 
 #include <cuda_runtime.h>
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 
 namespace UMAPAlgo {
 
@@ -92,7 +92,7 @@ void _fit(const raft::handle_t& handle,
           UMAPParams* params,
           value_t* embeddings)
 {
-  ML::PUSH_RANGE("umap::unsupervised::fit");
+  raft::common::nvtx::range fun_scope("umap::unsupervised::fit");
   cudaStream_t stream = handle.get_stream();
 
   int k = params->n_neighbors;
@@ -101,7 +101,7 @@ void _fit(const raft::handle_t& handle,
 
   CUML_LOG_DEBUG("n_neighbors=%d", params->n_neighbors);
 
-  ML::PUSH_RANGE("umap::knnGraph");
+  raft::common::nvtx::push_range("umap::knnGraph");
   std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
   std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
 
@@ -125,11 +125,11 @@ void _fit(const raft::handle_t& handle,
 
   kNNGraph::run<value_idx, value_t, umap_inputs>(
     handle, inputs, inputs, knn_graph, k, params, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   CUML_LOG_DEBUG("Done. Calling fuzzy simplicial set");
 
-  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::common::nvtx::push_range("umap::simplicial_set");
   raft::sparse::COO<value_t> rgraph_coo(stream);
   FuzzySimplSet::run<TPB_X, value_idx, value_t>(
     inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo, params, stream);
@@ -140,12 +140,12 @@ void _fit(const raft::handle_t& handle,
    */
   raft::sparse::COO<value_t> cgraph_coo(stream);
   raft::sparse::op::coo_remove_zeros<value_t>(&rgraph_coo, &cgraph_coo, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   /**
    * Run initialization method
    */
-  ML::PUSH_RANGE("umap::embedding");
+  raft::common::nvtx::push_range("umap::embedding");
   InitEmbed::run(handle, inputs.n, inputs.d, &cgraph_coo, params, embeddings, stream, params->init);
 
   if (params->callback) {
@@ -157,10 +157,9 @@ void _fit(const raft::handle_t& handle,
    * Run simplicial set embedding to approximate low-dimensional representation
    */
   SimplSetEmbed::run<TPB_X, value_t>(inputs.n, inputs.d, &cgraph_coo, params, embeddings, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   if (params->callback) params->callback->on_train_end(embeddings);
-  ML::POP_RANGE();
 }
 
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
@@ -171,7 +170,7 @@ void _get_graph(const raft::handle_t& handle,
                                                         // second template argument for COO
 )
 {
-  ML::PUSH_RANGE("umap::supervised::_get_graph");
+  raft::common::nvtx::range fun_scope("umap::supervised::_get_graph");
   cudaStream_t stream = handle.get_stream();
 
   int k = params->n_neighbors;
@@ -180,7 +179,7 @@ void _get_graph(const raft::handle_t& handle,
 
   CUML_LOG_DEBUG("n_neighbors=%d", params->n_neighbors);
 
-  ML::PUSH_RANGE("umap::knnGraph");
+  raft::common::nvtx::push_range("umap::knnGraph");
   std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
   std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
 
@@ -204,11 +203,11 @@ void _get_graph(const raft::handle_t& handle,
 
   kNNGraph::run<value_idx, value_t, umap_inputs>(
     handle, inputs, inputs, knn_graph, k, params, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   CUML_LOG_DEBUG("Done. Calling fuzzy simplicial set");
 
-  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::common::nvtx::push_range("umap::simplicial_set");
   raft::sparse::COO<value_t> rgraph_coo(stream);
   FuzzySimplSet::run<TPB_X, value_idx, value_t>(
     inputs.n, knn_graph.knn_indices, knn_graph.knn_dists, k, &rgraph_coo, params, stream);
@@ -219,7 +218,7 @@ void _get_graph(const raft::handle_t& handle,
    * Remove zeros from simplicial set
    */
   raft::sparse::op::coo_remove_zeros<value_t>(&rgraph_coo, cgraph_coo, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 }
 
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
@@ -231,7 +230,7 @@ void _get_graph_supervised(
                                           // second template argument for COO
 )
 {
-  ML::PUSH_RANGE("umap::supervised::_get_graph_supervised");
+  raft::common::nvtx::range fun_scope("umap::supervised::_get_graph_supervised");
   cudaStream_t stream = handle.get_stream();
 
   int k = params->n_neighbors;
@@ -240,7 +239,7 @@ void _get_graph_supervised(
 
   if (params->target_n_neighbors == -1) params->target_n_neighbors = params->n_neighbors;
 
-  ML::PUSH_RANGE("umap::knnGraph");
+  raft::common::nvtx::push_range("umap::knnGraph");
   std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
   std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
 
@@ -263,12 +262,12 @@ void _get_graph_supervised(
   kNNGraph::run<value_idx, value_t, umap_inputs>(
     handle, inputs, inputs, knn_graph, k, params, stream);
 
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   /**
    * Allocate workspace for fuzzy simplicial set.
    */
-  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::common::nvtx::push_range("umap::simplicial_set");
   raft::sparse::COO<value_t> rgraph_coo(stream);
   raft::sparse::COO<value_t> tmp_coo(stream);
 
@@ -312,7 +311,7 @@ void _get_graph_supervised(
 
   raft::sparse::COO<value_t> ocoo(stream);
   raft::sparse::op::coo_remove_zeros<value_t>(cgraph_coo, &ocoo, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 }
 
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
@@ -335,7 +334,7 @@ void _fit_supervised(const raft::handle_t& handle,
                      UMAPParams* params,
                      value_t* embeddings)
 {
-  ML::PUSH_RANGE("umap::supervised::fit");
+  raft::common::nvtx::range fun_scope("umap::supervised::fit");
   cudaStream_t stream = handle.get_stream();
 
   int k = params->n_neighbors;
@@ -344,7 +343,7 @@ void _fit_supervised(const raft::handle_t& handle,
 
   if (params->target_n_neighbors == -1) params->target_n_neighbors = params->n_neighbors;
 
-  ML::PUSH_RANGE("umap::knnGraph");
+  raft::common::nvtx::push_range("umap::knnGraph");
   std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
   std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
 
@@ -367,12 +366,12 @@ void _fit_supervised(const raft::handle_t& handle,
   kNNGraph::run<value_idx, value_t, umap_inputs>(
     handle, inputs, inputs, knn_graph, k, params, stream);
 
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   /**
    * Allocate workspace for fuzzy simplicial set.
    */
-  ML::PUSH_RANGE("umap::simplicial_set");
+  raft::common::nvtx::push_range("umap::simplicial_set");
   raft::sparse::COO<value_t> rgraph_coo(stream);
   raft::sparse::COO<value_t> tmp_coo(stream);
 
@@ -418,12 +417,12 @@ void _fit_supervised(const raft::handle_t& handle,
 
   raft::sparse::COO<value_t> ocoo(stream);
   raft::sparse::op::coo_remove_zeros<value_t>(&final_coo, &ocoo, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   /**
    * Initialize embeddings
    */
-  ML::PUSH_RANGE("umap::supervised::fit");
+  raft::common::nvtx::push_range("umap::embedding");
   InitEmbed::run(handle, inputs.n, inputs.d, &ocoo, params, embeddings, stream, params->init);
 
   if (params->callback) {
@@ -435,12 +434,11 @@ void _fit_supervised(const raft::handle_t& handle,
    * Run simplicial set embedding to approximate low-dimensional representation
    */
   SimplSetEmbed::run<TPB_X, value_t>(inputs.n, inputs.d, &ocoo, params, embeddings, stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   if (params->callback) params->callback->on_train_end(embeddings);
 
   CUDA_CHECK(cudaPeekAtLastError());
-  ML::POP_RANGE();
 }
 
 /**
@@ -455,7 +453,7 @@ void _transform(const raft::handle_t& handle,
                 UMAPParams* params,
                 value_t* transformed)
 {
-  ML::PUSH_RANGE("umap::transform");
+  raft::common::nvtx::range fun_scope("umap::transform");
   cudaStream_t stream = handle.get_stream();
 
   ML::Logger::get().setLevel(params->verbosity);
@@ -464,7 +462,7 @@ void _transform(const raft::handle_t& handle,
 
   CUML_LOG_DEBUG("Building KNN Graph");
 
-  ML::PUSH_RANGE("umap::knnGraph");
+  raft::common::nvtx::push_range("umap::knnGraph");
   std::unique_ptr<rmm::device_uvector<value_idx>> knn_indices_b = nullptr;
   std::unique_ptr<rmm::device_uvector<value_t>> knn_dists_b     = nullptr;
 
@@ -490,9 +488,9 @@ void _transform(const raft::handle_t& handle,
   kNNGraph::run<value_idx, value_t, umap_inputs>(
     handle, orig_x_inputs, inputs, knn_graph, k, params, stream);
 
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
-  ML::PUSH_RANGE("umap::smooth_knn");
+  raft::common::nvtx::push_range("umap::smooth_knn");
   float adjusted_local_connectivity = max(0.0, params->local_connectivity - 1.0);
 
   CUML_LOG_DEBUG("Smoothing KNN distances");
@@ -517,7 +515,7 @@ void _transform(const raft::handle_t& handle,
                                                                 params->n_neighbors,
                                                                 adjusted_local_connectivity,
                                                                 stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   /**
    * Compute graph of membership strengths
@@ -614,7 +612,7 @@ void _transform(const raft::handle_t& handle,
   raft::sparse::COO<value_t> comp_coo(stream);
   raft::sparse::op::coo_remove_zeros<value_t>(&graph_coo, &comp_coo, stream);
 
-  ML::PUSH_RANGE("umap::optimization");
+  raft::common::nvtx::push_range("umap::optimization");
   CUML_LOG_DEBUG("Computing # of epochs for training each sample");
 
   rmm::device_uvector<value_t> epochs_per_sample(nnz, stream);
@@ -643,10 +641,9 @@ void _transform(const raft::handle_t& handle,
                                                      params,
                                                      n_epochs,
                                                      stream);
-  ML::POP_RANGE();
+  raft::common::nvtx::pop_range();
 
   if (params->callback) params->callback->on_train_end(transformed);
-  ML::POP_RANGE();
 }
 
 }  // namespace UMAPAlgo
diff --git a/cpp/src_prims/linalg/lstsq.cuh b/cpp/src_prims/linalg/lstsq.cuh
index c464f453c5..2465150555 100644
--- a/cpp/src_prims/linalg/lstsq.cuh
+++ b/cpp/src_prims/linalg/lstsq.cuh
@@ -21,7 +21,7 @@
 #include <raft/linalg/cusolver_wrappers.h>
 #include <raft/linalg/gemv.h>
 #include <raft/linalg/transpose.h>
-#include <common/nvtx.hpp>
+#include <raft/common/nvtx.hpp>
 #include <raft/cuda_utils.cuh>
 #include <raft/linalg/eig.cuh>
 #include <raft/linalg/eltwise.cuh>
@@ -301,9 +301,9 @@ void lstsqEig(const raft::handle_t& handle,
   multAbDone.record(multAbStream);
 
   // Q S Q* <- covA
-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::lstsq::eigDC", mainStream);
+  raft::common::nvtx::push_range("raft::linalg::eigDC");
   raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
-  ML::POP_RANGE(mainStream);
+  raft::common::nvtx::pop_range();
 
   // QS  <- Q invS
   raft::linalg::matrixVectorOp(
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 6d08da5798..74f683948e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -66,7 +66,6 @@ if(BUILD_CUML_TESTS)
         sg/linkage_test.cu
         sg/logger.cpp
         sg/multi_sum_test.cu
-        sg/nvtx_test.cpp
         sg/ols.cu
         sg/pca_test.cu
         sg/quasi_newton.cu
diff --git a/cpp/test/sg/nvtx_test.cpp b/cpp/test/sg/nvtx_test.cpp
deleted file mode 100644
index 5efd0e827c..0000000000
--- a/cpp/test/sg/nvtx_test.cpp
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-/**
- * tests for the functionality of generating next color based on string
- * entered in the NVTX Range marker wrappers
- */
-
-namespace ML {
-
-uint32_t generateNextColor(const std::string& tag);
-
-class nvtxNextColorTest : public ::testing::Test {
- protected:
-  void SetUp() override
-  {
-    const std::string temp1 = "foo";
-    const std::string temp2 = "bar";
-
-    if (ML::generateNextColor(temp1) != ML::generateNextColor(temp2)) diff_string_diff_color = true;
-    if (ML::generateNextColor(temp1) == ML::generateNextColor(temp1)) same_string_same_color = true;
-  }
-  void TearDown() {}
-  bool diff_string_diff_color = false;
-  bool same_string_same_color = false;
-};
-
-TEST_F(nvtxNextColorTest, nvtxGenerateNextColorTest)
-{
-  EXPECT_TRUE(diff_string_diff_color);
-  EXPECT_TRUE(same_string_same_color);
-}
-
-}  // end namespace ML
diff --git a/python/cuml/common/cuda.pyx b/python/cuml/common/cuda.pyx
index c99e697568..7624426bbb 100644
--- a/python/cuml/common/cuda.pyx
+++ b/python/cuml/common/cuda.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,37 +16,6 @@
 
 # distutils: language = c++
 
-import functools
-from libcpp.string cimport string
-
-cdef extern from "common/nvtx.hpp" namespace "ML":
-
-    void PUSH_RANGE(string msg)
-
-    void POP_RANGE()
-
-
-def nvtx_range_push(msg: str):
-    """Create a NVTX range with name `msg`"""
-    cdef string s = msg.encode("UTF-8")
-    PUSH_RANGE(s.c_str())
-
-
-def nvtx_range_pop():
-    """End a NVTX range"""
-    POP_RANGE()
-
-
-def nvtx_range_wrap(func):
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        nvtx_range_push(func.__name__)
-        result = func(*args, **kwargs)
-        nvtx_range_pop()
-        return result
-    return wrapper
-
-
 class CudaRuntimeError(RuntimeError):
     def __init__(self, extraMsg=None):
         cdef _Error e = cudaGetLastError()
diff --git a/python/cuml/ensemble/randomforestclassifier.pyx b/python/cuml/ensemble/randomforestclassifier.pyx
index a38f0ff772..bbfba5dfd8 100644
--- a/python/cuml/ensemble/randomforestclassifier.pyx
+++ b/python/cuml/ensemble/randomforestclassifier.pyx
@@ -17,6 +17,7 @@
 
 # distutils: language = c++
 import numpy as np
+import nvtx
 import rmm
 import warnings
 
@@ -46,7 +47,6 @@ from libc.stdlib cimport calloc, malloc, free
 
 from numba import cuda
 
-from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop
 from cuml.raft.common.handle cimport handle_t
 cimport cuml.common.cuda
 
@@ -409,6 +409,9 @@ class RandomForestClassifier(BaseRandomForestModel,
                                  algo=algo,
                                  fil_sparse_format=fil_sparse_format)
 
+    @nvtx.annotate(
+        message="fit RF-Classifier @randomforestclassifier.pyx",
+        domain="cuml_python")
     @generate_docstring(skip_parameters_heading=True,
                         y='dense_intdtype',
                         convert_dtype_cast='np.float32')
@@ -426,7 +429,6 @@ class RandomForestClassifier(BaseRandomForestModel,
             y to be of dtype int32. This will increase memory used for
             the method.
         """
-        nvtx_range_push("Fit RF-Classifier @randomforestclassifier.pyx")
 
         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                 convert_dtype)
@@ -497,7 +499,6 @@ class RandomForestClassifier(BaseRandomForestModel,
         self.handle.sync()
         del X_m
         del y_m
-        nvtx_range_pop()
         return self
 
     @cuml.internals.api_base_return_array(get_output_dtype=True)
@@ -547,6 +548,9 @@ class RandomForestClassifier(BaseRandomForestModel,
         del(X_m)
         return preds
 
+    @nvtx.annotate(
+        message="predict RF-Classifier @randomforestclassifier.pyx",
+        domain="cuml_python")
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
                            return_values=[('dense', '(n_samples, 1)')])
     def predict(self, X, predict_model="GPU", threshold=0.5,
@@ -599,7 +603,6 @@ class RandomForestClassifier(BaseRandomForestModel,
         ----------
         y : {}
         """
-        nvtx_range_push("predict RF-Classifier @randomforestclassifier.pyx")
         if predict_model == "CPU":
             preds = self._predict_model_on_cpu(X,
                                                convert_dtype=convert_dtype)
@@ -622,7 +625,6 @@ class RandomForestClassifier(BaseRandomForestModel,
                                            fil_sparse_format=fil_sparse_format,
                                            predict_proba=False)
 
-        nvtx_range_pop()
         return preds
 
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
@@ -687,6 +689,9 @@ class RandomForestClassifier(BaseRandomForestModel,
 
         return preds_proba
 
+    @nvtx.annotate(
+        message="score RF-Classifier @randomforestclassifier.pyx",
+        domain="cuml_python")
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)'),
                                        ('dense_intdtype', '(n_samples, 1)')])
     def score(self, X, y, threshold=0.5,
@@ -741,7 +746,6 @@ class RandomForestClassifier(BaseRandomForestModel,
            Accuracy of the model [0.0 - 1.0]
         """
 
-        nvtx_range_push("score RF-Classifier @randomforestclassifier.pyx")
         cdef uintptr_t X_ptr, y_ptr
         _, n_rows, _, _ = \
             input_to_cuml_array(X, check_dtype=self.dtype,
@@ -795,7 +799,6 @@ class RandomForestClassifier(BaseRandomForestModel,
         self.handle.sync()
         del(y_m)
         del(preds_m)
-        nvtx_range_pop()
         return self.stats['accuracy']
 
     def get_summary_text(self):
diff --git a/python/cuml/ensemble/randomforestregressor.pyx b/python/cuml/ensemble/randomforestregressor.pyx
index aa45af2543..6f4538531d 100644
--- a/python/cuml/ensemble/randomforestregressor.pyx
+++ b/python/cuml/ensemble/randomforestregressor.pyx
@@ -17,6 +17,7 @@
 # distutils: language = c++
 
 import numpy as np
+import nvtx
 import rmm
 import warnings
 
@@ -47,7 +48,6 @@ from libc.stdlib cimport calloc, malloc, free
 
 from numba import cuda
 
-from cuml.common.cuda import nvtx_range_wrap, nvtx_range_push, nvtx_range_pop
 from cuml.raft.common.handle cimport handle_t
 cimport cuml.common.cuda
 
@@ -409,6 +409,9 @@ class RandomForestRegressor(BaseRandomForestModel,
                                  algo=algo,
                                  fil_sparse_format=fil_sparse_format)
 
+    @nvtx.annotate(
+        message="fit RF-Regressor @randomforestregressor.pyx",
+        domain="cuml_python")
     @generate_docstring()
     @cuml.internals.api_base_return_any_skipall
     def fit(self, X, y, convert_dtype=True):
@@ -416,7 +419,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         Perform Random Forest Regression on the input data
 
         """
-        nvtx_range_push("Fit RF-Regressor @randomforestregressor.pyx")
 
         X_m, y_m, max_feature_val = self._dataset_setup_for_fit(X, y,
                                                                 convert_dtype)
@@ -480,7 +482,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         self.handle.sync()
         del X_m
         del y_m
-        nvtx_range_pop()
         return self
 
     def _predict_model_on_cpu(self, X, convert_dtype) -> CumlArray:
@@ -530,6 +531,9 @@ class RandomForestRegressor(BaseRandomForestModel,
         del(X_m)
         return preds
 
+    @nvtx.annotate(
+        message="predict RF-Regressor @randomforestregressor.pyx",
+        domain="cuml_python")
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)')],
                            return_values=[('dense', '(n_samples, 1)')])
     def predict(self, X, predict_model="GPU",
@@ -578,7 +582,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         y : {}
 
         """
-        nvtx_range_push("predict RF-Regressor @randomforestregressor.pyx")
         if predict_model == "CPU":
             preds = self._predict_model_on_cpu(X, convert_dtype)
         elif self.dtype == np.float64:
@@ -598,9 +601,11 @@ class RandomForestRegressor(BaseRandomForestModel,
                 convert_dtype=convert_dtype,
                 fil_sparse_format=fil_sparse_format)
 
-        nvtx_range_pop()
         return preds
 
+    @nvtx.annotate(
+        message="score RF-Regressor @randomforestregressor.pyx",
+        domain="cuml_python")
     @insert_into_docstring(parameters=[('dense', '(n_samples, n_features)'),
                                        ('dense', '(n_samples, 1)')])
     def score(self, X, y, algo='auto', convert_dtype=True,
@@ -650,7 +655,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         median_abs_error : float or
         mean_abs_error : float
         """
-        nvtx_range_push("score RF-Regressor @randomforestregressor.pyx")
         from cuml.metrics.regression import r2_score
 
         cdef uintptr_t y_ptr
@@ -716,7 +720,6 @@ class RandomForestRegressor(BaseRandomForestModel,
         self.handle.sync()
         del(y_m)
         del(preds_m)
-        nvtx_range_pop()
         return stats
 
     def get_summary_text(self):
diff --git a/python/cuml/tsa/batched_lbfgs.py b/python/cuml/tsa/batched_lbfgs.py
index f12bacc629..9f90dc62ea 100644
--- a/python/cuml/tsa/batched_lbfgs.py
+++ b/python/cuml/tsa/batched_lbfgs.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 
 import cuml.common.logger as logger
 import numpy as np
+import nvtx
 
-from cuml.common.cuda import nvtx_range_push, nvtx_range_pop
 from cuml.common import has_scipy
 
 
@@ -36,6 +36,9 @@ def _fd_fprime(x, f, h):
     return g
 
 
+@nvtx.annotate(
+    message="LBFGS",
+    domain="cuml_python")
 def batched_fmin_lbfgs_b(func, x0, num_batches, fprime=None, args=(),
                          bounds=None, m=10, factr=1e7, pgtol=1e-5,
                          epsilon=1e-8,
@@ -87,7 +90,6 @@ def batched_fmin_lbfgs_b(func, x0, num_batches, fprime=None, args=(),
     else:
         raise RuntimeError("Scipy is needed to run batched_fmin_lbfgs_b")
 
-    nvtx_range_push("LBFGS")
     n = len(x0) // num_batches
 
     if fprime is None:
@@ -140,50 +142,51 @@ def fprime_f(x):
     warn_flag = np.zeros(num_batches)
 
     while not all(converged):
-        nvtx_range_push("LBFGS-ITERATION")
-        for ib in range(num_batches):
-            if converged[ib]:
-                continue
-
-            _lbfgsb.setulb(m, x[ib],
-                           low_bnd, upper_bnd,
-                           nbd,
-                           f[ib], g[ib],
-                           factr, pgtol,
-                           wa[ib], iwa[ib],
-                           task[ib],
-                           iprint,
-                           csave[ib],
-                           lsave[ib],
-                           isave[ib],
-                           dsave[ib],
-                           maxls)
-
-        xk = np.concatenate(x)
-        fk = func(xk)
-        gk = fprime(xk)
-        for ib in range(num_batches):
-            if converged[ib]:
-                continue
-            task_str = task[ib].tobytes()
-            task_str_strip = task[ib].tobytes().strip(b'\x00').strip()
-            if task_str.startswith(b'FG'):
-                # needs function evalation
-                f[ib] = fk[ib]
-                g[ib] = gk[ib*n:(ib+1)*n]
-            elif task_str.startswith(b'NEW_X'):
-                n_iterations[ib] += 1
-                if n_iterations[ib] >= maxiter:
-                    task[ib][:] = 'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
-            elif task_str_strip.startswith(b'CONV'):
-                converged[ib] = True
-                warn_flag[ib] = 0
-            else:
-                converged[ib] = True
-                warn_flag[ib] = 2
-                continue
-
-        nvtx_range_pop()
+        with nvtx.annotate("LBFGS-ITERATION", domain="cuml_python"):
+            for ib in range(num_batches):
+                if converged[ib]:
+                    continue
+
+                _lbfgsb.setulb(
+                    m, x[ib],
+                    low_bnd, upper_bnd,
+                    nbd,
+                    f[ib], g[ib],
+                    factr, pgtol,
+                    wa[ib], iwa[ib],
+                    task[ib],
+                    iprint,
+                    csave[ib],
+                    lsave[ib],
+                    isave[ib],
+                    dsave[ib],
+                    maxls)
+
+            xk = np.concatenate(x)
+            fk = func(xk)
+            gk = fprime(xk)
+            for ib in range(num_batches):
+                if converged[ib]:
+                    continue
+                task_str = task[ib].tobytes()
+                task_str_strip = task[ib].tobytes().strip(b'\x00').strip()
+                if task_str.startswith(b'FG'):
+                    # needs function evaluation
+                    f[ib] = fk[ib]
+                    g[ib] = gk[ib*n:(ib+1)*n]
+                elif task_str.startswith(b'NEW_X'):
+                    n_iterations[ib] += 1
+                    if n_iterations[ib] >= maxiter:
+                        task[ib][:] = \
+                            'STOP: TOTAL NO. of ITERATIONS REACHED LIMIT'
+                elif task_str_strip.startswith(b'CONV'):
+                    converged[ib] = True
+                    warn_flag[ib] = 0
+                else:
+                    converged[ib] = True
+                    warn_flag[ib] = 2
+                    continue
+
     xk = np.concatenate(x)
 
     if iprint > 0:
@@ -198,5 +201,4 @@ def fprime_f(x):
                     logger.info("WARNING: id={} convergence issue: {}".format(
                         ib, task[ib].tobytes()))
 
-    nvtx_range_pop()
     return xk, n_iterations, warn_flag