Skip to content

Commit

Permalink
Move NVTX range helpers to raft (#4445)
Browse files Browse the repository at this point in the history
Move NVTX range helpers to raft and extend them a little bit.
Corresponding raft PR: rapidsai/raft#416 .

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: #4445
  • Loading branch information
achirkin authored Dec 17, 2021
1 parent b9b97fc commit d630156
Show file tree
Hide file tree
Showing 24 changed files with 192 additions and 457 deletions.
5 changes: 1 addition & 4 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,7 @@ if(BUILD_CUML_CPP_LIBRARY)

# common components
add_library(${CUML_CPP_TARGET} SHARED
src/common/logger.cpp
src/common/nvtx.cu)
src/common/logger.cpp)

# FIL components
target_sources(${CUML_CPP_TARGET}
Expand Down Expand Up @@ -367,7 +366,6 @@ if(BUILD_CUML_CPP_LIBRARY)

target_compile_definitions(${CUML_CPP_TARGET}
PUBLIC
$<$<BOOL:${NVTX}>:NVTX_ENABLED>
DISABLE_CUSPARSE_DEPRECATED
PRIVATE
CUML_CPP_API
Expand Down Expand Up @@ -407,7 +405,6 @@ if(BUILD_CUML_CPP_LIBRARY)
CUDA::cudart
CUDA::cusparse
GPUTreeShap::GPUTreeShap
$<$<BOOL:${NVTX}>:CUDA::nvToolsExt>
$<$<BOOL:${LINK_FAISS}>:FAISS::FAISS>
$<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_static,treelite::treelite>
$<IF:$<BOOL:${Treelite_ADDED}>,treelite::treelite_runtime_static,treelite::treelite_runtime>
Expand Down
7 changes: 4 additions & 3 deletions cpp/cmake/thirdparty/get_raft.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ function(find_and_configure_raft)
GIT_TAG ${PKG_PINNED_TAG}
SOURCE_SUBDIR cpp
OPTIONS
"BUILD_TESTS OFF"
)
"BUILD_TESTS OFF"
"NVTX ${NVTX}"
)

if(raft_ADDED)
message(VERBOSE "CUML: Using RAFT located in ${raft_SOURCE_DIR}")
Expand All @@ -58,4 +59,4 @@ set(CUML_BRANCH_VERSION_raft "${CUML_VERSION_MAJOR}.${CUML_VERSION_MINOR}")
find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft}
FORK rapidsai
PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
)
)
28 changes: 8 additions & 20 deletions cpp/src/arima/batched_arima.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#include <cuml/tsa/batched_kalman.hpp>

#include <raft/cudart_utils.h>
#include <common/nvtx.hpp>
#include <linalg/batched/matrix.cuh>
#include <metrics/batched/information_criterion.cuh>
#include <raft/common/nvtx.hpp>
#include <raft/cuda_utils.cuh>
#include <raft/handle.hpp>
#include <raft/linalg/matrix_vector_op.cuh>
Expand Down Expand Up @@ -104,7 +104,7 @@ void predict(raft::handle_t& handle,
double* d_lower,
double* d_upper)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);
const auto stream = handle.get_stream();

bool diff = order.need_diff() && pre_diff && level == 0;
Expand Down Expand Up @@ -245,8 +245,6 @@ void predict(raft::handle_t& handle,
});
/// TODO: 2D copy kernel?
}

ML::POP_RANGE();
}

/**
Expand Down Expand Up @@ -360,7 +358,7 @@ void conditional_sum_of_squares(raft::handle_t& handle,
double* d_loglike,
int truncate)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);
auto stream = handle.get_stream();

int n_phi = order.n_phi();
Expand Down Expand Up @@ -393,8 +391,6 @@ void conditional_sum_of_squares(raft::handle_t& handle,
start_y,
start_v);
CUDA_CHECK(cudaPeekAtLastError());

ML::POP_RANGE();
}

void batched_loglike(raft::handle_t& handle,
Expand All @@ -417,7 +413,7 @@ void batched_loglike(raft::handle_t& handle,
double* d_lower,
double* d_upper)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);

auto stream = handle.get_stream();

Expand Down Expand Up @@ -473,7 +469,6 @@ void batched_loglike(raft::handle_t& handle,
/* Tranfer log-likelihood device -> host */
raft::update_host(loglike, d_loglike, batch_size, stream);
}
ML::POP_RANGE();
}

void batched_loglike(raft::handle_t& handle,
Expand All @@ -490,7 +485,7 @@ void batched_loglike(raft::handle_t& handle,
LoglikeMethod method,
int truncate)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);

// unpack parameters
auto stream = handle.get_stream();
Expand Down Expand Up @@ -518,8 +513,6 @@ void batched_loglike(raft::handle_t& handle,
host_loglike,
method,
truncate);

ML::POP_RANGE();
}

void batched_loglike_grad(raft::handle_t& handle,
Expand All @@ -536,7 +529,7 @@ void batched_loglike_grad(raft::handle_t& handle,
LoglikeMethod method,
int truncate)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);
auto stream = handle.get_stream();
auto counting = thrust::make_counting_iterator(0);
int N = order.complexity();
Expand Down Expand Up @@ -597,8 +590,6 @@ void batched_loglike_grad(raft::handle_t& handle,
d_x_pert[N * bid + i] = d_x[N * bid + i];
});
}

ML::POP_RANGE();
}

void information_criterion(raft::handle_t& handle,
Expand All @@ -612,7 +603,7 @@ void information_criterion(raft::handle_t& handle,
double* d_ic,
int ic_type)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);
auto stream = handle.get_stream();

/* Compute log-likelihood in d_ic */
Expand All @@ -628,8 +619,6 @@ void information_criterion(raft::handle_t& handle,
batch_size,
n_obs - order.n_diff(),
stream);

ML::POP_RANGE();
}

/**
Expand Down Expand Up @@ -962,7 +951,7 @@ void estimate_x0(raft::handle_t& handle,
const ARIMAOrder& order,
bool missing)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);
const auto& handle_impl = handle;
auto stream = handle_impl.get_stream();
auto cublas_handle = handle_impl.get_cublas_handle();
Expand Down Expand Up @@ -1007,7 +996,6 @@ void estimate_x0(raft::handle_t& handle,

// Do the computation of the initial parameters
_start_params(handle, params, bm_yd, bm_exog_diff, order);
ML::POP_RANGE();
}

} // namespace ML
14 changes: 5 additions & 9 deletions cpp/src/arima/batched_kalman.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@
#include <raft/linalg/binary_op.cuh>
#include <rmm/device_uvector.hpp>

#include <common/nvtx.hpp>
#include <linalg/batched/matrix.cuh>
#include <linalg/block.cuh>
#include <raft/common/nvtx.hpp>
#include <timeSeries/arima_helpers.cuh>

namespace ML {
Expand Down Expand Up @@ -1283,7 +1283,7 @@ void _batched_kalman_filter(raft::handle_t& handle,
MLCommon::LinAlg::Batched::b_gemm(false, true, rd, rd, 1, 1.0, RQb, Rb, 0.0, RQR);

// Durbin Koopman "Time Series Analysis" pg 138
ML::PUSH_RANGE("Init P");
raft::common::nvtx::push_range("Init P");
MLCommon::LinAlg::Batched::Matrix<double> P(
rd, rd, batch_size, cublasHandle, arima_mem.P_batches, arima_mem.P_dense, stream, true);
{
Expand Down Expand Up @@ -1326,7 +1326,7 @@ void _batched_kalman_filter(raft::handle_t& handle,
_lyapunov_wrapper(handle, arima_mem, Tb, RQR, P, rd);
}
}
ML::POP_RANGE();
raft::common::nvtx::pop_range();

// Initialize the state alpha by solving (I - T*) x* = c with:
// | mu |
Expand Down Expand Up @@ -1442,7 +1442,7 @@ void init_batched_kalman_matrices(raft::handle_t& handle,
double* d_R_b,
double* d_T_b)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);

auto stream = handle.get_stream();

Expand Down Expand Up @@ -1535,8 +1535,6 @@ void init_batched_kalman_matrices(raft::handle_t& handle,
// If rd=2 and phi_2=-1, I-TxT is singular
if (rd == 2 && order.p == 2 && abs(batch_T[1] + 1) < 0.01) { batch_T[1] = -0.99; }
});

ML::POP_RANGE();
}

void batched_kalman_filter(raft::handle_t& handle,
Expand All @@ -1556,7 +1554,7 @@ void batched_kalman_filter(raft::handle_t& handle,
double* d_lower,
double* d_upper)
{
ML::PUSH_RANGE(__func__);
raft::common::nvtx::range fun_scope(__func__);

auto cublasHandle = handle.get_cublas_handle();
auto stream = handle.get_stream();
Expand Down Expand Up @@ -1607,8 +1605,6 @@ void batched_kalman_filter(raft::handle_t& handle,
level,
d_lower,
d_upper);

ML::POP_RANGE();
}

void batched_jones_transform(raft::handle_t& handle,
Expand Down
Loading

0 comments on commit d630156

Please sign in to comment.