diff --git a/cpp/src/common/nvtx.hpp b/cpp/src/common/nvtx.hpp
index bf9d16ed8d..fda1cd660f 100644
--- a/cpp/src/common/nvtx.hpp
+++ b/cpp/src/common/nvtx.hpp
@@ -42,4 +42,85 @@ void PUSH_RANGE(const char* name);
 /** Pop the latest range */
 void POP_RANGE();

+/** Push a named nvtx range that is popped at the end of the object's lifetime. */
+class AUTO_RANGE {
+ private:
+  std::optional<rmm::cuda_stream_view> stream;
+
+  template <typename... Args>
+  void init(const char* name, Args... args)
+  {
+    if constexpr (sizeof...(args) > 0) {
+      int length = std::snprintf(nullptr, 0, name, args...);
+      assert(length >= 0);
+      auto buf = std::make_unique<char[]>(length + 1);
+      std::snprintf(buf.get(), length + 1, name, args...);
+
+      if (stream.has_value())
+        PUSH_RANGE(buf.get(), stream.value());
+      else
+        PUSH_RANGE(buf.get());
+    } else {
+      if (stream.has_value())
+        PUSH_RANGE(name, stream.value());
+      else
+        PUSH_RANGE(name);
+    }
+  }
+
+ public:
+  /**
+   * Synchronize the CUDA stream and push a named nvtx range.
+   * At the end of the object's lifetime, synchronize again and pop the range.
+   *
+   * @param stream stream to synchronize
+   * @param name range name (accepts printf-style arguments)
+   * @param args the arguments for the printf-style formatting
+   */
+  template <typename... Args>
+  AUTO_RANGE(rmm::cuda_stream_view stream, const char* name, Args... args)
+    : stream(std::make_optional(stream))
+  {
+    init(name, args...);
+  }
+
+  /**
+   * Push a named nvtx range.
+   * At the end of the object's lifetime, pop the range.
+   *
+   * @param name range name (accepts printf-style arguments)
+   * @param args the arguments for the printf-style formatting
+   */
+  template <typename... Args>
+  AUTO_RANGE(const char* name, Args... args) : stream(std::nullopt)
+  {
+    init(name, args...);
+  }
+
+  ~AUTO_RANGE()
+  {
+    if (stream.has_value())
+      POP_RANGE(stream.value());
+    else
+      POP_RANGE();
+  }
+};
+
+/*!
+  \def CUML_USING_RANGE(...)
+  When NVTX is enabled, push a named nvtx range and pop it at the end of the enclosing code block.
+
+  This macro initializes a dummy AUTO_RANGE variable on the stack,
+  which pushes the range in its constructor and pops it in the destructor.
+  The two-level CUML_NVTX_CONCAT indirection expands __LINE__ before token pasting,
+  so each use gets a uniquely named variable.
+*/
+#ifdef NVTX_ENABLED
+#define CUML_NVTX_CONCAT_IMPL(a, b) a##b
+#define CUML_NVTX_CONCAT(a, b) CUML_NVTX_CONCAT_IMPL(a, b)
+#define CUML_USING_RANGE(...) ML::AUTO_RANGE CUML_NVTX_CONCAT(_AUTO_RANGE_, __LINE__)(__VA_ARGS__)
+#else
+#define CUML_USING_RANGE(...) (void)0
+#endif
+
 } // end namespace ML
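
A minimal usage sketch of the new wrapper (not part of the patch itself; the function, stream, and range names are made up for illustration, the include path follows the patch's layout, and it assumes nvtx.hpp itself pulls in <optional>, <memory>, <cstdio>, <cassert> and <rmm/cuda_stream_view.hpp>, with NVTX_ENABLED supplied by the build):

    #include <common/nvtx.hpp>

    #include <rmm/cuda_stream_view.hpp>

    void exampleSolver(rmm::cuda_stream_view stream, int n_rows, int n_cols)
    {
      // Formatted, function-wide range: pushed here, popped automatically on
      // every exit path from this function.
      CUML_USING_RANGE("example::exampleSolver-%d-%d", n_rows, n_cols);

      {
        // Stream-synchronized range limited to this block: the constructor
        // synchronizes `stream` before pushing, and the destructor synchronizes
        // again before popping, so the range brackets the GPU work.
        CUML_USING_RANGE(stream, "example::gpu-stage");
        // ... launch kernels on `stream` ...
      }
      // ... host-side post-processing ...
      // With NVTX disabled, both CUML_USING_RANGE lines compile to (void)0.
    }
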
diff --git a/cpp/src/glm/ols.cuh b/cpp/src/glm/ols.cuh
index 0334f72906..1124435f4a 100644
--- a/cpp/src/glm/ols.cuh
+++ b/cpp/src/glm/ols.cuh
@@ -94,17 +94,18 @@ void olsFit(const raft::handle_t& handle,
   int selectedAlgo = algo;
   if (n_cols > n_rows || n_cols == 1) selectedAlgo = 0;

-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::ols-lstsq*", stream);
-  switch (selectedAlgo) {
-    case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
-    default:
-      ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
-      break;
+  {
+    CUML_USING_RANGE(stream, "ML::GLM::olsFit::impl-%d", selectedAlgo);
+    switch (selectedAlgo) {
+      case 0: LinAlg::lstsqSvdJacobi(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 1: LinAlg::lstsqEig(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 2: LinAlg::lstsqQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      case 3: LinAlg::lstsqSvdQR(handle, input, n_rows, n_cols, labels, coef, stream); break;
+      default:
+        ASSERT(false, "olsFit: no algorithm with this id (%d) has been implemented", algo);
+        break;
+    }
   }
-  ML::POP_RANGE(stream);

   if (fit_intercept) {
     postProcessData(handle,
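
With NVTX enabled, the CUML_USING_RANGE line in the hunk above expands to a scoped AUTO_RANGE object, roughly as sketched below (the numeric suffix comes from __LINE__; 101 is a placeholder, not the real line number). The extra braces plus the destructor replace the old explicit ML::POP_RANGE(stream), so the range is popped on every exit path from the block:

    {
      // CUML_USING_RANGE(stream, "ML::GLM::olsFit::impl-%d", selectedAlgo); expands to roughly:
      ML::AUTO_RANGE _AUTO_RANGE_101(stream, "ML::GLM::olsFit::impl-%d", selectedAlgo);
      switch (selectedAlgo) {
        // ... cases as in the hunk above ...
      }
    }  // ~AUTO_RANGE() synchronizes `stream` and pops the range here
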
diff --git a/cpp/src/glm/preprocess.cuh b/cpp/src/glm/preprocess.cuh
index 8ee77966c2..db173643ad 100644
--- a/cpp/src/glm/preprocess.cuh
+++ b/cpp/src/glm/preprocess.cuh
@@ -17,6 +17,7 @@
 #pragma once

 #include
+#include <common/nvtx.hpp>
 #include
 #include
 #include
@@ -46,17 +47,22 @@ void preProcessData(const raft::handle_t& handle,
                     bool normalize,
                     cudaStream_t stream)
 {
+  CUML_USING_RANGE("ML::GLM::preProcessData-%d-%d", n_rows, n_cols);
   ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
   ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");

   if (fit_intercept) {
-    raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
-    raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    {
+      CUML_USING_RANGE(stream, "ML::GLM::preProcessData-mean");
+      raft::stats::mean(mu_input, input, n_cols, n_rows, false, false, stream);
+      raft::stats::meanCenter(input, input, mu_input, n_cols, n_rows, false, true, stream);

-    raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
-    raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+      raft::stats::mean(mu_labels, labels, 1, n_rows, false, false, stream);
+      raft::stats::meanCenter(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+    }

     if (normalize) {
+      CUML_USING_RANGE(stream, "ML::GLM::preProcessData-normalize");
       raft::linalg::colNorm(norm2_input,
                             input,
                             n_cols,
@@ -86,6 +92,7 @@ void postProcessData(const raft::handle_t& handle,
                      bool normalize,
                      cudaStream_t stream)
 {
+  CUML_USING_RANGE("ML::GLM::postProcessData-%d-%d", n_rows, n_cols);
   ASSERT(n_cols > 0, "Parameter n_cols: number of columns cannot be less than one");
   ASSERT(n_rows > 1, "Parameter n_rows: number of rows cannot be less than two");

@@ -93,19 +100,31 @@ void postProcessData(const raft::handle_t& handle,
   rmm::device_scalar<math_t> d_intercept(stream);

   if (normalize) {
+    CUML_USING_RANGE(stream, "ML::GLM::postProcessData-denormalize");
     raft::matrix::matrixVectorBinaryMult(input, norm2_input, n_rows, n_cols, false, true, stream);
     raft::matrix::matrixVectorBinaryDivSkipZero(
       coef, norm2_input, 1, n_cols, false, true, stream, true);
   }

-  raft::linalg::gemm(
-    handle, mu_input, 1, n_cols, coef, d_intercept.data(), 1, 1, CUBLAS_OP_N, CUBLAS_OP_N, stream);
-
-  raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
-  *intercept = d_intercept.value(stream);
+  {
+    CUML_USING_RANGE(stream, "ML::GLM::postProcessData-shift");
+    raft::linalg::gemm(handle,
+                       mu_input,
+                       1,
+                       n_cols,
+                       coef,
+                       d_intercept.data(),
+                       1,
+                       1,
+                       CUBLAS_OP_N,
+                       CUBLAS_OP_N,
+                       stream);

-  raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
-  raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+    raft::linalg::subtract(d_intercept.data(), mu_labels, d_intercept.data(), 1, stream);
+    *intercept = d_intercept.value(stream);
+    raft::stats::meanAdd(input, input, mu_input, n_cols, n_rows, false, true, stream);
+    raft::stats::meanAdd(labels, labels, mu_labels, 1, n_rows, false, true, stream);
+  }
 }

 };  // namespace GLM
diff --git a/cpp/src/svm/linear.cu b/cpp/src/svm/linear.cu
index be5401f211..36b7dc87b0 100644
--- a/cpp/src/svm/linear.cu
+++ b/cpp/src/svm/linear.cu
@@ -362,6 +362,8 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
                                          const T* y,
                                          const T* sampleWeight)
 {
+  CUML_USING_RANGE("ML::SVM::LinearSVMModel-%d-%d", nRows, nCols);
+
   cudaStream_t stream = handle.get_stream();
   rmm::device_uvector<T> classesBuf(0, stream);
   const std::size_t nClasses =
@@ -376,8 +378,6 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
   const int coefCols = narrowDown<int>(model.coefCols());
   const std::size_t coefRows = model.coefRows;

-  ML::PUSH_RANGE("Trace::LinearSVMModel::fit");
-
   auto nCols1 = nCols + int(params.fit_intercept && params.penalized_intercept);
   T iC = params.C > 0 ? (1.0 / params.C) : 1.0;

@@ -504,7 +504,6 @@ LinearSVMModel<T> LinearSVMModel<T>::fit(const raft::handle_t& handle,
     raft::linalg::transpose(handle, ps1, model.probScale, 2, coefCols, stream);
   }

-  ML::POP_RANGE();
   return model;
 }

diff --git a/cpp/src_prims/linalg/lstsq.cuh b/cpp/src_prims/linalg/lstsq.cuh
index b920fa084c..b946bda1c0 100644
--- a/cpp/src_prims/linalg/lstsq.cuh
+++ b/cpp/src_prims/linalg/lstsq.cuh
@@ -301,9 +301,10 @@ void lstsqEig(const raft::handle_t& handle,
   multAbDone.record(multAbStream);

   // Q S Q* <- covA
-  ML::PUSH_RANGE("Trace::MLCommon::LinAlg::lstsq::eigDC", mainStream);
-  raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
-  ML::POP_RANGE(mainStream);
+  {
+    CUML_USING_RANGE(mainStream, "raft::linalg::eigDC");
+    raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
+  }

   // QS <- Q invS
   raft::linalg::matrixVectorOp(
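
The lstsqEig hunk is the stream-synchronized flavor of the same pattern: the range around raft::linalg::eigDC is pushed after synchronizing mainStream and popped after synchronizing it again, so the range reflects the device-side duration of the eigendecomposition rather than just the host-side launch. A hand-written equivalent of that block (what AUTO_RANGE does internally when constructed with a stream) would be:

    ML::PUSH_RANGE("raft::linalg::eigDC", mainStream);  // sync mainStream, then push
    raft::linalg::eigDC(handle, covA, n_cols, n_cols, Q, S, mainStream);
    ML::POP_RANGE(mainStream);                          // sync again, then pop

with the difference that the RAII form also pops the range if eigDC exits via an exception.
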